From b3e014c4e8050e8a0b3da190bb327347f9136b7e Mon Sep 17 00:00:00 2001 From: adrian555 Date: Tue, 18 Sep 2018 13:48:56 -0700 Subject: [PATCH 01/12] support eager execution --- R/pkg/R/DataFrame.R | 14 +++-- R/pkg/tests/fulltests/test_eager_execution.R | 58 ++++++++++++++++++++ docs/sparkr.md | 36 ++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 R/pkg/tests/fulltests/test_eager_execution.R diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 458decaf4766f..c60c4600cd74b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -244,11 +244,15 @@ setMethod("showDF", #' @note show(SparkDataFrame) since 1.4.0 setMethod("show", "SparkDataFrame", function(object) { - cols <- lapply(dtypes(object), function(l) { - paste(l, collapse = ":") - }) - s <- paste(cols, collapse = ", ") - cat(paste(class(object), "[", s, "]\n", sep = "")) + if (identical(sparkR.conf("spark.sql.repl.eagerEval.enabled", "false")[[1]], "true")) { + showDF(object) + } else { + cols <- lapply(dtypes(object), function(l) { + paste(l, collapse = ":") + }) + s <- paste(cols, collapse = ", ") + cat(paste(class(object), "[", s, "]\n", sep = "")) + } }) #' DataTypes diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R new file mode 100644 index 0000000000000..36549c48181a8 --- /dev/null +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -0,0 +1,58 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("Show SparkDataFrame when eager execution is enabled.") + +test_that("eager execution is not enabled", { + # Start Spark session without eager execution enabled + sparkSession <- if (windows_with_hadoop()) { + sparkR.session(master = sparkRTestMaster) + } else { + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + } + + df <- suppressWarnings(createDataFrame(iris)) + expect_is(df, "SparkDataFrame") + expected <- "Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string" + expect_output(show(df), expected) + + # Stop Spark session + sparkR.session.stop() +}) + +test_that("eager execution is enabled", { + # Start Spark session without eager execution enabled + sparkSession <- if (windows_with_hadoop()) { + sparkR.session(master = sparkRTestMaster, + sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) + } else { + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, + sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) + } + + df <- suppressWarnings(createDataFrame(iris)) + expect_is(df, "SparkDataFrame") + expected <- paste0("+------------+-----------+------------+-----------+-------+\n", + "|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|\n", + "+------------+-----------+------------+-----------+-------+\n") + expect_output(show(df), expected) + + # Stop Spark session + sparkR.session.stop() +}) \ No newline at end of file diff --git a/docs/sparkr.md b/docs/sparkr.md index b4248e8bb21de..9ed1f6b128e09 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -450,6 +450,42 @@ print(model.summaries) {% endhighlight %} +### Eager execution + +If the eager execution is enabled, the data will be returned to R client immediately when the `SparkDataFrame` is created. Eager execution can be enabled by setting the configuration property `spark.sql.repl.eagerEval.enabled` to `true` when the `SparkSession` is started up. + +
+{% highlight r %} + +# Start up spark session with eager execution enabled +sparkR.session(master = "local[*]", sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) + +df <- createDataFrame(faithful) + +# Instead of displaying the SparkDataFrame class, displays the data returned +df + +##+---------+-------+ +##|eruptions|waiting| +##+---------+-------+ +##| 3.6| 79.0| +##| 1.8| 54.0| +##| 3.333| 74.0| +##| 2.283| 62.0| +##| 4.533| 85.0| +##| 2.883| 55.0| +##| 4.7| 88.0| +##| 3.6| 85.0| +##| 1.95| 51.0| +##| 4.35| 85.0| +##+---------+-------+ +##only showing top 10 rows + +{% endhighlight %} +
+ +Note that the `SparkSession` created by `sparkR` shell does not have eager execution enabled. You can stop the current session and start up a new session like above to enable. + ## Running SQL Queries from SparkR A SparkDataFrame can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The `sql` function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. From cd8a7041c6eecc59d22db72f4f2065ed9f06640a Mon Sep 17 00:00:00 2001 From: adrian555 Date: Tue, 18 Sep 2018 14:09:48 -0700 Subject: [PATCH 02/12] add newline --- R/pkg/tests/fulltests/test_eager_execution.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index 36549c48181a8..54d3de809da5e 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -55,4 +55,4 @@ test_that("eager execution is enabled", { # Stop Spark session sparkR.session.stop() -}) \ No newline at end of file +}) From a89bf37e7fbfa089a39e0fb91cdd4a1bdd409b9f Mon Sep 17 00:00:00 2001 From: adrian555 Date: Thu, 20 Sep 2018 13:58:02 -0700 Subject: [PATCH 03/12] address review comment --- R/pkg/R/DataFrame.R | 3 +- R/pkg/tests/fulltests/test_eager_execution.R | 12 ++--- docs/sparkr.md | 52 ++++++++++++-------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c60c4600cd74b..bf40ca06d7aff 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -226,7 +226,8 @@ setMethod("showDF", #' show #' -#' Print class and type information of a Spark object. +#' If eager evaluation is enabled, return the data of the SparkDataFrame object, otherwise, +#' print the class and type information of the SparkDataFrame object. #' #' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. 
#' diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index 54d3de809da5e..c31a02dad49bc 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -27,9 +27,9 @@ test_that("eager execution is not enabled", { sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) } - df <- suppressWarnings(createDataFrame(iris)) + df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") - expected <- "Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string" + expected <- "eruptions:double, waiting:double" expect_output(show(df), expected) # Stop Spark session @@ -46,11 +46,11 @@ test_that("eager execution is enabled", { sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) } - df <- suppressWarnings(createDataFrame(iris)) + df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") - expected <- paste0("+------------+-----------+------------+-----------+-------+\n", - "|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|\n", - "+------------+-----------+------------+-----------+-------+\n") + expected <- paste0("+---------+-------+\n", + "|eruptions|waiting|\n", + "+---------+-------+\n") expect_output(show(df), expected) # Stop Spark session diff --git a/docs/sparkr.md b/docs/sparkr.md index 9ed1f6b128e09..5ef9091c287b9 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -460,31 +460,43 @@ If the eager execution is enabled, the data will be returned to R client immedia # Start up spark session with eager execution enabled sparkR.session(master = "local[*]", sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) +# Create a grouped and sorted SparkDataFrame df <- createDataFrame(faithful) - -# Instead of displaying the SparkDataFrame class, displays the data returned -df - -##+---------+-------+ -##|eruptions|waiting| -##+---------+-------+ -##| 3.6| 79.0| -##| 1.8| 54.0| -##| 3.333| 74.0| -##| 2.283| 62.0| -##| 4.533| 85.0| -##| 2.883| 55.0| -##| 4.7| 88.0| -##| 3.6| 85.0| -##| 1.95| 51.0| -##| 4.35| 85.0| -##+---------+-------+ -##only showing top 10 rows +df2 <- arrange(summarize(groupBy(df, df$waiting), count = n(df$waiting)), "waiting") + +# Similar to R data.frame, displays the data returned, instead of SparkDataFrame class string +df2 + +##+-------+-----+ +##|waiting|count| +##+-------+-----+ +##| 43.0| 1| +##| 45.0| 3| +##| 46.0| 5| +##| 47.0| 4| +##| 48.0| 3| +##| 49.0| 5| +##| 50.0| 5| +##| 51.0| 6| +##| 52.0| 5| +##| 53.0| 7| +##| 54.0| 9| +##| 55.0| 6| +##| 56.0| 4| +##| 57.0| 3| +##| 58.0| 4| +##| 59.0| 7| +##| 60.0| 6| +##| 62.0| 4| +##| 63.0| 3| +##| 64.0| 4| +##+-------+-----+ +##only showing top 20 rows {% endhighlight %} -Note that the `SparkSession` created by `sparkR` shell does not have eager execution enabled. You can stop the current session and start up a new session like above to enable. +Note that to enable eager execution through `sparkR` command, add `spark.sql.repl.eagerEval.enabled=true` configuration property to the `--conf` option. ## Running SQL Queries from SparkR A SparkDataFrame can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. 
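
A minimal interactive sketch of the behavior documented so far — assuming a SparkR build with the series applied through this patch; the master URL and dataset are illustrative:

    library(SparkR)
    # Enable eager evaluation at session startup; with it on, typing a
    # SparkDataFrame's name at the REPL prints its top rows (via showDF)
    # instead of the "SparkDataFrame[...]" schema string.
    sparkR.session(master = "local[*]",
                   sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true"))
    df <- createDataFrame(faithful)
    df   # eagerly evaluated and printed, equivalent to showDF(df)
    sparkR.session.stop()
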
From a083d64ea3210c367cc4d63ad51a1c6beab129e7 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Thu, 20 Sep 2018 15:03:30 -0700 Subject: [PATCH 04/12] address review comment --- R/pkg/tests/fulltests/test_eager_execution.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index c31a02dad49bc..8d68838140bcf 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -37,7 +37,7 @@ test_that("eager execution is not enabled", { }) test_that("eager execution is enabled", { - # Start Spark session without eager execution enabled + # Start Spark session with eager execution enabled sparkSession <- if (windows_with_hadoop()) { sparkR.session(master = sparkRTestMaster, sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) From 7b121e65b99e177d8870b4e098797f1f1e86ce65 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Thu, 20 Sep 2018 17:02:31 -0700 Subject: [PATCH 05/12] address review comment --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index bf40ca06d7aff..5e7a4833d329f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -226,8 +226,8 @@ setMethod("showDF", #' show #' -#' If eager evaluation is enabled, return the data of the SparkDataFrame object, otherwise, -#' print the class and type information of the SparkDataFrame object. +#' If eager evaluation is enabled and the Spark object is a SparkDataFrame, return the data of +#' the SparkDataFrame object, otherwise, print the class and type information of the Spark object. #' #' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. #' From 4492b278ba5a4721d6a5dc836436191ad155dfc6 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Fri, 21 Sep 2018 10:36:33 -0700 Subject: [PATCH 06/12] address review comment --- R/pkg/R/DataFrame.R | 17 ++++++++++++++--- R/pkg/tests/fulltests/test_eager_execution.R | 11 +++++++---- docs/sparkr.md | 18 ++++++------------ .../apache/spark/sql/internal/SQLConf.scala | 7 ++++--- 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5e7a4833d329f..4364d87c83184 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -226,8 +226,9 @@ setMethod("showDF", #' show #' -#' If eager evaluation is enabled and the Spark object is a SparkDataFrame, return the data of -#' the SparkDataFrame object, otherwise, print the class and type information of the Spark object. +#' If eager evaluation is enabled and the Spark object is a SparkDataFrame, evaluate the +#' SparkDataFrame and print top rows of the SparkDataFrame, otherwise, print the class +#' and type information of the Spark object. #' #' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. 
#' @@ -246,7 +247,17 @@ setMethod("showDF", setMethod("show", "SparkDataFrame", function(object) { if (identical(sparkR.conf("spark.sql.repl.eagerEval.enabled", "false")[[1]], "true")) { - showDF(object) + argsList <- list() + argsList$x <- object + numRows <- as.numeric(sparkR.conf("spark.sql.repl.eagerEval.maxNumRows", "0")[[1]]) + if (numRows > 0) { + argsList$numRows <- numRows + } + truncate <- as.numeric(sparkR.conf("spark.sql.repl.eagerEval.truncate", "0")[[1]]) + if (truncate > 0) { + argsList$truncate <- truncate + } + do.call(showDF, argsList) } else { cols <- lapply(dtypes(object), function(l) { paste(l, collapse = ":") diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index 8d68838140bcf..c4fa6f1d79e6a 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -40,17 +40,20 @@ test_that("eager execution is enabled", { # Start Spark session with eager execution enabled sparkSession <- if (windows_with_hadoop()) { sparkR.session(master = sparkRTestMaster, - sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) + sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true", + spark.sql.repl.eagerEval.maxNumRows = as.integer(10))) } else { sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, - sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) + sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true", + spark.sql.repl.eagerEval.maxNumRows = as.integer(10))) } df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") - expected <- paste0("+---------+-------+\n", + expected <- paste0("(+---------+-------+\n", "|eruptions|waiting|\n", - "+---------+-------+\n") + "+---------+-------+\n)*", + "(only showing top 10 rows)") expect_output(show(df), expected) # Stop Spark session diff --git a/docs/sparkr.md b/docs/sparkr.md index 5ef9091c287b9..5816b30563e2d 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -454,11 +454,15 @@ print(model.summaries) If the eager execution is enabled, the data will be returned to R client immediately when the `SparkDataFrame` is created. Eager execution can be enabled by setting the configuration property `spark.sql.repl.eagerEval.enabled` to `true` when the `SparkSession` is started up. +Maximum number of rows and maximum number of characters per column of data to display can be controlled by `spark.sql.repl.eagerEval.maxNumRows` and `spark.sql.repl.eagerEval.truncate` configuration properties, respectively. +
{% highlight r %} # Start up spark session with eager execution enabled -sparkR.session(master = "local[*]", sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true")) +sparkR.session(master = "local[*]", + sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true", + spark.sql.repl.eagerEval.maxNumRows = as.integer(10))) # Create a grouped and sorted SparkDataFrame df <- createDataFrame(faithful) @@ -480,18 +484,8 @@ df2 ##| 51.0| 6| ##| 52.0| 5| ##| 53.0| 7| -##| 54.0| 9| -##| 55.0| 6| -##| 56.0| 4| -##| 57.0| 3| -##| 58.0| 4| -##| 59.0| 7| -##| 60.0| 6| -##| 62.0| 4| -##| 63.0| 3| -##| 64.0| 4| ##+-------+-----+ -##only showing top 20 rows +##only showing top 10 rows {% endhighlight %}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4928560eacb1c..bd76a7d79d365 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1491,9 +1491,10 @@ object SQLConf { val REPL_EAGER_EVAL_ENABLED = buildConf("spark.sql.repl.eagerEval.enabled") .doc("Enables eager evaluation or not. When true, the top K rows of Dataset will be " + "displayed if and only if the REPL supports the eager evaluation. Currently, the " + - "eager evaluation is only supported in PySpark. For the notebooks like Jupyter, " + - "the HTML table (generated by _repr_html_) will be returned. For plain Python REPL, " + - "the returned outputs are formatted like dataframe.show().") + "eager evaluation is supported in PySpark and SparkR. In PySpark, for the notebooks like " + + "Jupyter, the HTML table (generated by _repr_html_) will be returned. For plain Python " + + "REPL, the returned outputs are formatted like dataframe.show(). In SparkR, the returned " + + "outputs are showed as R data.frame.") .booleanConf .createWithDefault(false) From 100bac715ee3fc6baee960dfa7c9466baa032bcb Mon Sep 17 00:00:00 2001 From: adrian555 Date: Mon, 24 Sep 2018 11:16:41 -0700 Subject: [PATCH 07/12] address review comment --- R/pkg/R/DataFrame.R | 20 +++++++++++++------- R/pkg/tests/fulltests/test_eager_execution.R | 9 ++++----- docs/sparkr.md | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4364d87c83184..896e970b40124 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -246,16 +246,22 @@ setMethod("showDF", #' @note show(SparkDataFrame) since 1.4.0 setMethod("show", "SparkDataFrame", function(object) { - if (identical(sparkR.conf("spark.sql.repl.eagerEval.enabled", "false")[[1]], "true")) { + allConf <- sparkR.conf() + if (!is.null(allConf[["spark.sql.repl.eagerEval.enabled"]]) && + identical(allConf[["spark.sql.repl.eagerEval.enabled"]], "true")) { argsList <- list() argsList$x <- object - numRows <- as.numeric(sparkR.conf("spark.sql.repl.eagerEval.maxNumRows", "0")[[1]]) - if (numRows > 0) { - argsList$numRows <- numRows + if (!is.null(allConf[["spark.sql.repl.eagerEval.maxNumRows"]])) { + numRows <- as.numeric(allConf[["spark.sql.repl.eagerEval.maxNumRows"]]) + if (numRows > 0) { + argsList$numRows <- numRows + } } - truncate <- as.numeric(sparkR.conf("spark.sql.repl.eagerEval.truncate", "0")[[1]]) - if (truncate > 0) { - argsList$truncate <- truncate + if (!is.null(allConf[["spark.sql.repl.eagerEval.truncate"]])) { + truncate <- as.numeric(allConf[["spark.sql.repl.eagerEval.truncate"]]) + if (truncate > 0) { + argsList$truncate <- truncate + } } do.call(showDF, argsList) } else { diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index c4fa6f1d79e6a..3ef9832d26ca4 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -38,14 +38,13 @@ test_that("eager execution is not enabled", { test_that("eager execution is enabled", { # Start Spark session with eager execution enabled + sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true", + spark.sql.repl.eagerEval.maxNumRows = as.integer(10)) sparkSession <- if (windows_with_hadoop()) { - sparkR.session(master = sparkRTestMaster, - sparkConfig = 
list(spark.sql.repl.eagerEval.enabled = "true",
-                                      spark.sql.repl.eagerEval.maxNumRows = as.integer(10)))
+    sparkR.session(master = sparkRTestMaster, sparkConfig = sparkConfig)
   } else {
     sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE,
-                   sparkConfig = list(spark.sql.repl.eagerEval.enabled = "true",
-                                      spark.sql.repl.eagerEval.maxNumRows = as.integer(10)))
+                   sparkConfig = sparkConfig)
   }
 
   df <- createDataFrame(faithful)
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 5816b30563e2d..96b171c144bfe 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -452,9 +452,9 @@ print(model.summaries)
 
 ### Eager execution
 
-If the eager execution is enabled, the data will be returned to R client immediately when the `SparkDataFrame` is created. Eager execution can be enabled by setting the configuration property `spark.sql.repl.eagerEval.enabled` to `true` when the `SparkSession` is started up.
+If eager execution is enabled, the data will be returned to the R client immediately when the `SparkDataFrame` is created. Eager execution is disabled by default; it can be enabled by setting the configuration property `spark.sql.repl.eagerEval.enabled` to `true` when the `SparkSession` is started up.
 
-Maximum number of rows and maximum number of characters per column of data to display can be controlled by `spark.sql.repl.eagerEval.maxNumRows` and `spark.sql.repl.eagerEval.truncate` configuration properties, respectively.
+The maximum number of rows and the maximum number of characters per column to display can be controlled with the `spark.sql.repl.eagerEval.maxNumRows` and `spark.sql.repl.eagerEval.truncate` configuration properties, respectively. These properties take effect only when eager execution is enabled. If they are not set explicitly, up to 20 rows and up to 20 characters per column are shown by default.
{% highlight r %} From 57eb00824b072b2a326810ff42edef2ca2626eb6 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Tue, 25 Sep 2018 10:51:34 -0700 Subject: [PATCH 08/12] address review comment --- R/pkg/R/DataFrame.R | 14 ++++++++------ R/pkg/tests/fulltests/test_eager_execution.R | 14 +++----------- docs/sparkr.md | 2 +- .../org/apache/spark/sql/internal/SQLConf.scala | 2 +- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 896e970b40124..404bb627ce338 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -247,18 +247,20 @@ setMethod("showDF", setMethod("show", "SparkDataFrame", function(object) { allConf <- sparkR.conf() - if (!is.null(allConf[["spark.sql.repl.eagerEval.enabled"]]) && - identical(allConf[["spark.sql.repl.eagerEval.enabled"]], "true")) { + prop <- allConf[["spark.sql.repl.eagerEval.enabled"]] + if (!is.null(prop) && identical(prop, "true")) { argsList <- list() argsList$x <- object - if (!is.null(allConf[["spark.sql.repl.eagerEval.maxNumRows"]])) { - numRows <- as.numeric(allConf[["spark.sql.repl.eagerEval.maxNumRows"]]) + prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] + if (!is.null(prop)) { + numRows <- as.numeric(prop) if (numRows > 0) { argsList$numRows <- numRows } } - if (!is.null(allConf[["spark.sql.repl.eagerEval.truncate"]])) { - truncate <- as.numeric(allConf[["spark.sql.repl.eagerEval.truncate"]]) + prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] + if (!is.null(prop)) { + truncate <- as.numeric(prop) if (truncate > 0) { argsList$truncate <- truncate } diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index 3ef9832d26ca4..3652150627a3d 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -21,11 +21,7 @@ context("Show SparkDataFrame when eager execution is enabled.") test_that("eager execution is not enabled", { # Start Spark session without eager execution enabled - sparkSession <- if (windows_with_hadoop()) { - sparkR.session(master = sparkRTestMaster) - } else { - sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - } + sparkR.session(master = sparkRTestMaster) df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") @@ -40,12 +36,8 @@ test_that("eager execution is enabled", { # Start Spark session with eager execution enabled sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true", spark.sql.repl.eagerEval.maxNumRows = as.integer(10)) - sparkSession <- if (windows_with_hadoop()) { - sparkR.session(master = sparkRTestMaster, sparkConfig = sparkConfig) - } else { - sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, - sparkConfig = sparkConfig) - } + + sparkR.session(master = sparkRTestMaster, sparkConfig = sparkConfig) df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") diff --git a/docs/sparkr.md b/docs/sparkr.md index 96b171c144bfe..d43caca5f40e3 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -490,7 +490,7 @@ df2 {% endhighlight %}
-Note that to enable eager execution through `sparkR` command, add `spark.sql.repl.eagerEval.enabled=true` configuration property to the `--conf` option.
+Note that to enable eager execution in the `sparkR` shell, add the `spark.sql.repl.eagerEval.enabled=true` configuration property to the `--conf` option.
 
 ## Running SQL Queries from SparkR
 A SparkDataFrame can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index bd76a7d79d365..80121afd79f0a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1494,7 +1494,7 @@ object SQLConf {
       "eager evaluation is supported in PySpark and SparkR. In PySpark, for the notebooks like " +
       "Jupyter, the HTML table (generated by _repr_html_) will be returned. For plain Python " +
       "REPL, the returned outputs are formatted like dataframe.show(). In SparkR, the returned " +
-      "outputs are showed as R data.frame.")
+      "outputs are shown similarly to how an R data.frame would be displayed.")
     .booleanConf
     .createWithDefault(false)

From df582fc8d31c4e993a6e215fa0901a4728affc0e Mon Sep 17 00:00:00 2001
From: adrian555
Date: Wed, 26 Sep 2018 16:06:03 -0700
Subject: [PATCH 09/12] add sparkr.SparkDataFrame.base_show_func option

---
 R/pkg/R/DataFrame.R                   | 56 +++++++++++++++------------
 R/pkg/tests/fulltests/test_sparkSQL.R | 12 ++++++
 2 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 404bb627ce338..7fde8bae3727b 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -229,6 +229,8 @@ setMethod("showDF",
 #' If eager evaluation is enabled and the Spark object is a SparkDataFrame, evaluate the
 #' SparkDataFrame and print top rows of the SparkDataFrame, otherwise, print the class
 #' and type information of the Spark object.
+#' If the Spark object is a SparkDataFrame, default \emph{show} method may be replaced by
+#' the function set with \strong{options(sparkr.SparkDataFrame.base_show_func=show_func)}.
 #'
 #' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
#' @@ -246,32 +248,38 @@ setMethod("showDF", #' @note show(SparkDataFrame) since 1.4.0 setMethod("show", "SparkDataFrame", function(object) { - allConf <- sparkR.conf() - prop <- allConf[["spark.sql.repl.eagerEval.enabled"]] - if (!is.null(prop) && identical(prop, "true")) { - argsList <- list() - argsList$x <- object - prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] - if (!is.null(prop)) { - numRows <- as.numeric(prop) - if (numRows > 0) { - argsList$numRows <- numRows + showFunc <- getOption("sparkr.SparkDataFrame.base_show_func") + if (!is.null(showFunc)) { + stopifnot(typeof(showFunc) == "closure") + showFunc(object) + } else { + allConf <- sparkR.conf() + prop <- allConf[["spark.sql.repl.eagerEval.enabled"]] + if (!is.null(prop) && identical(prop, "true")) { + argsList <- list() + argsList$x <- object + prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] + if (!is.null(prop)) { + numRows <- as.numeric(prop) + if (numRows > 0) { + argsList$numRows <- numRows + } } - } - prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] - if (!is.null(prop)) { - truncate <- as.numeric(prop) - if (truncate > 0) { - argsList$truncate <- truncate + prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] + if (!is.null(prop)) { + truncate <- as.numeric(prop) + if (truncate > 0) { + argsList$truncate <- truncate + } } - } - do.call(showDF, argsList) - } else { - cols <- lapply(dtypes(object), function(l) { - paste(l, collapse = ":") - }) - s <- paste(cols, collapse = ", ") - cat(paste(class(object), "[", s, "]\n", sep = "")) + do.call(showDF, argsList) + } else { + cols <- lapply(dtypes(object), function(l) { + paste(l, collapse = ":") + }) + s <- paste(cols, collapse = ", ") + cat(paste(class(object), "[", s, "]\n", sep = "")) + } } }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 0c4bdb31b027b..4720c9064e2fe 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3711,6 +3711,18 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { dropTempView("cars") }) +test_that("Set default show() method for SparkDataFrame", { + df <- createDataFrame(faithful) + expected <- "eruptions:double, waiting:double" + expect_output(show(df), expected) + + options(sparkr.SparkDataFrame.base_show_func = + function(x) {print(paste0("Class of ", class(x)))}) + expected <- "Class of SparkDataFrame" + expect_output(show(df), expected) + options(sparkr.SparkDataFrame.base_show_func = NULL) +}) + compare_list <- function(list1, list2) { # get testthat to show the diff by first making the 2 lists equal in length expect_equal(length(list1), length(list2)) From 44df922fb347211c9a962bf1771e2e0005634305 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Wed, 3 Oct 2018 13:27:05 -0700 Subject: [PATCH 10/12] address review comment --- R/pkg/tests/fulltests/test_eager_execution.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_eager_execution.R index 3652150627a3d..b860b993e1a6a 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_eager_execution.R @@ -21,7 +21,7 @@ context("Show SparkDataFrame when eager execution is enabled.") test_that("eager execution is not enabled", { # Start Spark session without eager execution enabled - sparkR.session(master = sparkRTestMaster) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) df <- createDataFrame(faithful) expect_is(df, 
"SparkDataFrame") @@ -37,7 +37,7 @@ test_that("eager execution is enabled", { sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true", spark.sql.repl.eagerEval.maxNumRows = as.integer(10)) - sparkR.session(master = sparkRTestMaster, sparkConfig = sparkConfig) + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkConfig) df <- createDataFrame(faithful) expect_is(df, "SparkDataFrame") From 76767f9e5288d593efedda000c7fc01799d43085 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Thu, 18 Oct 2018 11:05:47 -0700 Subject: [PATCH 11/12] address review comment --- R/pkg/R/DataFrame.R | 56 ++++++++++++--------------- R/pkg/tests/fulltests/test_sparkSQL.R | 11 ------ 2 files changed, 24 insertions(+), 43 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a71d3315307fb..14b1154c24b39 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -229,8 +229,6 @@ setMethod("showDF", #' If eager evaluation is enabled and the Spark object is a SparkDataFrame, evaluate the #' SparkDataFrame and print top rows of the SparkDataFrame, otherwise, print the class #' and type information of the Spark object. -#' If the Spark object is a SparkDataFrame, default \emph{show} method may be replaced by -#' the function set with \strong{options(sparkr.SparkDataFrame.base_show_func=show_func)}. #' #' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. #' @@ -248,38 +246,32 @@ setMethod("showDF", #' @note show(SparkDataFrame) since 1.4.0 setMethod("show", "SparkDataFrame", function(object) { - showFunc <- getOption("sparkr.SparkDataFrame.base_show_func") - if (!is.null(showFunc)) { - stopifnot(typeof(showFunc) == "closure") - showFunc(object) - } else { - allConf <- sparkR.conf() - prop <- allConf[["spark.sql.repl.eagerEval.enabled"]] - if (!is.null(prop) && identical(prop, "true")) { - argsList <- list() - argsList$x <- object - prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] - if (!is.null(prop)) { - numRows <- as.numeric(prop) - if (numRows > 0) { - argsList$numRows <- numRows - } + allConf <- sparkR.conf() + prop <- allConf[["spark.sql.repl.eagerEval.enabled"]] + if (!is.null(prop) && identical(prop, "true")) { + argsList <- list() + argsList$x <- object + prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] + if (!is.null(prop)) { + numRows <- as.numeric(prop) + if (numRows > 0) { + argsList$numRows <- numRows } - prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] - if (!is.null(prop)) { - truncate <- as.numeric(prop) - if (truncate > 0) { - argsList$truncate <- truncate - } + } + prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] + if (!is.null(prop)) { + truncate <- as.numeric(prop) + if (truncate > 0) { + argsList$truncate <- truncate } - do.call(showDF, argsList) - } else { - cols <- lapply(dtypes(object), function(l) { - paste(l, collapse = ":") - }) - s <- paste(cols, collapse = ", ") - cat(paste(class(object), "[", s, "]\n", sep = "")) - } + } + do.call(showDF, argsList) + } else { + cols <- lapply(dtypes(object), function(l) { + paste(l, collapse = ":") + }) + s <- paste(cols, collapse = ", ") + cat(paste(class(object), "[", s, "]\n", sep = "")) } }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 8f78e10f40e4a..91d90127a3190 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3731,17 +3731,6 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { dropTempView("cars") }) 
-test_that("Set default show() method for SparkDataFrame", { - df <- createDataFrame(faithful) - expected <- "eruptions:double, waiting:double" - expect_output(show(df), expected) - - options(sparkr.SparkDataFrame.base_show_func = - function(x) {print(paste0("Class of ", class(x)))}) - expected <- "Class of SparkDataFrame" - expect_output(show(df), expected) - options(sparkr.SparkDataFrame.base_show_func = NULL) -}) compare_list <- function(list1, list2) { # get testthat to show the diff by first making the 2 lists equal in length From b5e463f8babe75463054fe1224348bb707afc195 Mon Sep 17 00:00:00 2001 From: adrian555 Date: Tue, 23 Oct 2018 10:52:17 -0700 Subject: [PATCH 12/12] address review comment --- R/pkg/R/DataFrame.R | 4 +-- R/pkg/tests/fulltests/test_sparkSQL.R | 1 - ...ager_execution.R => test_sparkSQL_eager.R} | 28 ++++++++++++++++--- 3 files changed, 26 insertions(+), 7 deletions(-) rename R/pkg/tests/fulltests/{test_eager_execution.R => test_sparkSQL_eager.R} (66%) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 14b1154c24b39..bf82d0c7882d7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -253,14 +253,14 @@ setMethod("show", "SparkDataFrame", argsList$x <- object prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]] if (!is.null(prop)) { - numRows <- as.numeric(prop) + numRows <- as.integer(prop) if (numRows > 0) { argsList$numRows <- numRows } } prop <- allConf[["spark.sql.repl.eagerEval.truncate"]] if (!is.null(prop)) { - truncate <- as.numeric(prop) + truncate <- as.integer(prop) if (truncate > 0) { argsList$truncate <- truncate } diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 91d90127a3190..5ad5d78d3ed17 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3731,7 +3731,6 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { dropTempView("cars") }) - compare_list <- function(list1, list2) { # get testthat to show the diff by first making the 2 lists equal in length expect_equal(length(list1), length(list2)) diff --git a/R/pkg/tests/fulltests/test_eager_execution.R b/R/pkg/tests/fulltests/test_sparkSQL_eager.R similarity index 66% rename from R/pkg/tests/fulltests/test_eager_execution.R rename to R/pkg/tests/fulltests/test_sparkSQL_eager.R index b860b993e1a6a..df7354fa063e9 100644 --- a/R/pkg/tests/fulltests/test_eager_execution.R +++ b/R/pkg/tests/fulltests/test_sparkSQL_eager.R @@ -17,7 +17,7 @@ library(testthat) -context("Show SparkDataFrame when eager execution is enabled.") +context("test show SparkDataFrame when eager execution is enabled.") test_that("eager execution is not enabled", { # Start Spark session without eager execution enabled @@ -34,8 +34,7 @@ test_that("eager execution is not enabled", { test_that("eager execution is enabled", { # Start Spark session with eager execution enabled - sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true", - spark.sql.repl.eagerEval.maxNumRows = as.integer(10)) + sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true") sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkConfig) @@ -44,7 +43,28 @@ test_that("eager execution is enabled", { expected <- paste0("(+---------+-------+\n", "|eruptions|waiting|\n", "+---------+-------+\n)*", - "(only showing top 10 rows)") + "(only showing top 20 rows)") + expect_output(show(df), expected) + + # Stop Spark session + sparkR.session.stop() +}) + +test_that("eager execution is enabled with maxNumRows 
and truncate set", { + # Start Spark session with eager execution enabled + sparkConfig <- list(spark.sql.repl.eagerEval.enabled = "true", + spark.sql.repl.eagerEval.maxNumRows = as.integer(5), + spark.sql.repl.eagerEval.truncate = as.integer(2)) + + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, sparkConfig = sparkConfig) + + df <- arrange(createDataFrame(faithful), "waiting") + expect_is(df, "SparkDataFrame") + expected <- paste0("(+---------+-------+\n", + "|eruptions|waiting|\n", + "+---------+-------+\n", + "| 1.| 43|\n)*", + "(only showing top 5 rows)") expect_output(show(df), expected) # Stop Spark session