From efc2f6634b54cd91e4946d4d4e04be769769f4ad Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 7 Mar 2016 16:19:07 -0800 Subject: [PATCH 01/22] Added histogram function --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 111 ++++++++++++++++++++++ R/pkg/R/generics.R | 4 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 28 ++++++ 4 files changed, 144 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 6a3d63f43f785..ccaee147a227e 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -164,6 +164,7 @@ exportMethods("%in%", "getItem", "greatest", "hex", + "histogram", "hour", "hypot", "ifelse", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index e5521f3cffadf..711489cdf586d 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2638,3 +2638,114 @@ setMethod("sort_array", jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc) column(jc) }) + +#' This function renders a histogram for a given SparkR Column. Note: This function depends on the ggplot2 package. +#' +#' @name histogram +#' @title Histogram +#' @param nbins the number of bins (optional). The default is 10. +#' @param df the DataFrame containing the Column to build the histogram from. +#' @param colname the name of the column to build the histogram from. 
+#' @return a ggplot object with the histogram +#' @examples \dontrun{ +#' +#' # Create a DataFrame from the Irijjs dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Render a histogram for the Sepal_Length column +#' histogram(irisDF, "Sepal_Length", nbins=12) +#' } +# TODO: the name of this function will be changed to hist to match R's after SPARK-9325 is fixed +setMethod("histogram", + signature(df="DataFrame"), + function (df, colname, nbins) { + # Require ggplot library + require(ggplot2) + + # Compute statistics for the histogram + if (missing(nbins)) { + histData <- hist.stats(df, colname) + } else { + histData <- hist.stats(df, colname, nbins) + } + + # Get the target column name + targetColname <- histData[[1]] + + # Get the input data for the plot + histStats <- histData[[2]] + + # Render the plot from the stats. Add axis labels. + plot <- ggplot(histStats, aes(x = centroids, y = counts)) + plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) + plot <- plot + xlab(targetColname) + ylab("Frequency") + + return(plot) + }) + +#' This function computes statistics to render a histogram. These are +#' centroids and counts. +#' +#' @name hist.stats +#' @title Compute histogram statistics +#' @param formula a formula or a bigr.vector specifying the data source +#' @param nbins the number of bins (optional) +#' @param data optional bigr.frame. If specified, this serves as the +#' environment for column references in the formula that don't have an +#' explicit bigr.frame. 
+#' @return a data.frame with the computed statistics +#' @seealso \link{bigr.histogram} +hist.stats <- function(df, colname=NULL, nbins=10) { + # Validate nbins + if (nbins < 2) { + stop("The number of bins must be a positive integer number greater than 1.") + } + + # Validate colname + if (is.null(colname) | is.na(colname)) { + stop("colname must be specified.") + } + if (!colname %in% colnames(df)) { + stop("Specified colname does not belong to the given DataFrame.") + } + + # Filter null values (i.e., NA's) in all grouping columns as well as in the target column + df <- na.omit(df[, colname]) + + # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed + x <- eval(parse(text=paste0("df$", colname))) + + stats <- collect(describe(df[, colname])) + min <- as.numeric(stats[4,2]) + max <- as.numeric(stats[5,2]) + + # Normalize the data + xnorm <- (x - min) / (max - min) + + # Round the data to 4 significant digits. This is to avoid rounding issues. + xnorm <- cast(xnorm * 10000, "integer") / 10000.0 + + # Since min = 0, max = 1 (data is already normalized) + normBinSize <- 1 / nbins + binsize <- (max - min) / nbins + approxBins <- xnorm / normBinSize + + # Adjust values that are equal to the upper bound of each bin + bins <- cast(approxBins - + ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), "integer") + + df$bins <- bins + histStats <- collect(count(groupBy(df, "bins"))) + names(histStats) <- c("bins", "counts") + + # Fill bins with zero counts + y <- data.frame("bins"=seq(0, nbins - 1)) + histStats <- merge(histStats, y, all.x=T, all.y=T) + histStats[is.na(histStats$count), 2] <- 0 + + # Compute centroids + histStats$centroids <- histStats$bins * binsize + min + binsize / 2 + + # Return the statistics + return(list(colname, histStats)) +} diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ab61bce03df23..055321184d3f9 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -99,6 +99,10 @@ setGeneric("getJRDD", 
function(rdd, ...) { standardGeneric("getJRDD") }) # @export setGeneric("glom", function(x) { standardGeneric("glom") }) +# @rdname histogram +# @export +setGeneric("histogram", function(df, colname, nbins) { standardGeneric("histogram") }) + # @rdname keyBy # @export setGeneric("keyBy", function(x, func) { standardGeneric("keyBy") }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cc118108f61cc..2b9b1ab5abe97 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1897,6 +1897,34 @@ test_that("Method str()", { expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris))) }) +test_that("Histogram", { + + # Basic histogram test + expect_equal( + all(histogram(irisDF, "Petal_Width", 8)$data == + data.frame(bins=seq(0, 7), + counts=c(48, 2, 7, 21, 24, 19, 15, 14), + centroids=seq(0,7) * 0.3 + 0.25)), + TRUE) + + # Missing nbins + expect_equal(length(histogram(irisDF, "Petal_Width")$data$counts), 10) + + # Wrong colname + expect_error(histogram(irisDF, "xxx"), + "Specified colname does not belong to the given DataFrame.") + + # Invalid nbins + expect_error(histogram(irisDF, "Petal_Width", nbins=0), + "The number of bins must be a positive integer number greater than 1.") + + # Test against R's hist + expect_equal(all(hist(iris$Sepal.Width)$counts == + histogram(irisDF, "Sepal_Width", 12)$data$counts), T) + + # Test when there are zero counts + df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100))) +}) unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From 0ad424bbcd03bf4c57566dbe92e537db213ba187 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Mon, 7 Mar 2016 16:43:46 -0800 Subject: [PATCH 02/22] Added test case where some bins have zero counts --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 2b9b1ab5abe97..999671f3d3548 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1924,6 +1924,7 @@ test_that("Histogram", { # Test when there are zero counts df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100))) + expect_equal(histogram(df, "x")$data$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) }) unlink(parquetPath) unlink(jsonPath) From d19992b4ec5141221cbf8724dc592b09e541039b Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 7 Mar 2016 17:25:15 -0800 Subject: [PATCH 03/22] Added check for ggplot2 package --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 999671f3d3548..793c55131672e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1899,6 +1899,12 @@ test_that("Method str()", { test_that("Histogram", { + # If ggplot2 is not installed, install it + if (!("ggplot2" %in% installed.packages()[, 1])) { + install.packages("ggplot2", repos = "http://cran.us.r-project.org") + } + library(ggplot2) + # Basic histogram test expect_equal( all(histogram(irisDF, "Petal_Width", 8)$data == From ac8f4c9ca56b592c32c60dc945023050df89bdb4 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Mon, 7 Mar 2016 19:13:36 -0800 Subject: [PATCH 04/22] Suppressed warnings for loading ggplot --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 793c55131672e..c8e5782329ea9 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1899,11 +1899,13 @@ test_that("Method str()", { test_that("Histogram", { - # If ggplot2 is not installed, install it - if (!("ggplot2" %in% installed.packages()[, 1])) { - install.packages("ggplot2", repos = "http://cran.us.r-project.org") - } - library(ggplot2) + suppressWarnings({ + # If ggplot2 is not installed, install it + if (!("ggplot2" %in% installed.packages()[, 1])) { + install.packages("ggplot2", repos = "http://cran.us.r-project.org") + } + library(ggplot2) + }) # Basic histogram test expect_equal( From 125b82dd2b58490ed66b9d8dd6855c8db8faee8d Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Mon, 21 Mar 2016 17:14:21 -0700 Subject: [PATCH 05/22] Modified histogram to remove ggplot2 dependency --- R/pkg/DESCRIPTION | 5 +- R/pkg/R/functions.R | 153 +++++++++------------- R/pkg/R/generics.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 16 +-- 4 files changed, 68 insertions(+), 108 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 465bc37788e5d..0cd0d75df0f70 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -18,10 +18,10 @@ Collate: 'schema.R' 'generics.R' 'jobj.R' - 'RDD.R' - 'pairRDD.R' 'column.R' 'group.R' + 'RDD.R' + 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' 'backend.R' @@ -36,3 +36,4 @@ Collate: 'stats.R' 'types.R' 'utils.R' +RoxygenNote: 5.0.1 diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 711489cdf586d..c413d5e4c2d4f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2639,113 +2639,80 @@ setMethod("sort_array", column(jc) }) -#' This function renders a histogram for a given SparkR Column. Note: This function depends on the ggplot2 package. +#' This function computes a histogram for a given SparkR Column. #' #' @name histogram #' @title Histogram #' @param nbins the number of bins (optional). The default is 10. #' @param df the DataFrame containing the Column to build the histogram from. #' @param colname the name of the column to build the histogram from. -#' @return a ggplot object with the histogram +#' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
#' @examples \dontrun{ #' -#' # Create a DataFrame from the Irijjs dataset +#' # Create a DataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' -#' # Render a histogram for the Sepal_Length column -#' histogram(irisDF, "Sepal_Length", nbins=12) -#' } -# TODO: the name of this function will be changed to hist to match R's after SPARK-9325 is fixed +#' # Compute histogram statistics +#' histData <- histogram(df, "colname"Sepal_Length", nbins=12) +#' +#' # Once SparkR has computed the histogram statistics, it would be very easy to +#' # render the histogram using R's visualization packages such as ggplot2. +#' +#' } setMethod("histogram", - signature(df="DataFrame"), - function (df, colname, nbins) { - # Require ggplot library - require(ggplot2) - - # Compute statistics for the histogram - if (missing(nbins)) { - histData <- hist.stats(df, colname) - } else { - histData <- hist.stats(df, colname, nbins) + signature(df = "DataFrame"), + function(df, colname, nbins=10) { + # Validate nbins + if (nbins < 2) { + stop("The number of bins must be a positive integer number greater than 1.") } - # Get the target column name - targetColname <- histData[[1]] + # Validate colname + if (is.null(colname) | is.na(colname)) { + stop("colname must be specified.") + } + if (!colname %in% names(df)) { + stop("Specified colname does not belong to the given DataFrame.") + } - # Get the input data for the plot - histStats <- histData[[2]] + # Filter NA values in the target column + df <- na.omit(df[, colname]) - # Render the plot from the stats. Add axis labels. 
- plot <- ggplot(histStats, aes(x = centroids, y = counts)) - plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) - plot <- plot + xlab(targetColname) + ylab("Frequency") + # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed + x <- eval(parse(text=paste0("df$", colname))) - return(plot) - }) + stats <- collect(describe(df[, colname])) + min <- as.numeric(stats[4,2]) + max <- as.numeric(stats[5,2]) -#' This function computes statistics to render a histogram. These are -#' centroids and counts. -#' -#' @name hist.stats -#' @title Compute histogram statistics -#' @param formula a formula or a bigr.vector specifying the data source -#' @param nbins the number of bins (optional) -#' @param data optional bigr.frame. If specified, this serves as the -#' environment for column references in the formula that don't have an -#' explicit bigr.frame. -#' @return a data.frame with the computed statistics -#' @seealso \link{bigr.histogram} -hist.stats <- function(df, colname=NULL, nbins=10) { - # Validate nbins - if (nbins < 2) { - stop("The number of bins must be a positive integer number greater than 1.") - } - - # Validate colname - if (is.null(colname) | is.na(colname)) { - stop("colname must be specified.") - } - if (!colname %in% colnames(df)) { - stop("Specified colname does not belong to the given DataFrame.") - } - - # Filter null values (i.e., NA's) in all grouping columns as well as in the target column - df <- na.omit(df[, colname]) - - # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed - x <- eval(parse(text=paste0("df$", colname))) - - stats <- collect(describe(df[, colname])) - min <- as.numeric(stats[4,2]) - max <- as.numeric(stats[5,2]) - - # Normalize the data - xnorm <- (x - min) / (max - min) - - # Round the data to 4 significant digits. This is to avoid rounding issues. 
- xnorm <- cast(xnorm * 10000, "integer") / 10000.0 - - # Since min = 0, max = 1 (data is already normalized) - normBinSize <- 1 / nbins - binsize <- (max - min) / nbins - approxBins <- xnorm / normBinSize - - # Adjust values that are equal to the upper bound of each bin - bins <- cast(approxBins - - ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), "integer") - - df$bins <- bins - histStats <- collect(count(groupBy(df, "bins"))) - names(histStats) <- c("bins", "counts") - - # Fill bins with zero counts - y <- data.frame("bins"=seq(0, nbins - 1)) - histStats <- merge(histStats, y, all.x=T, all.y=T) - histStats[is.na(histStats$count), 2] <- 0 - - # Compute centroids - histStats$centroids <- histStats$bins * binsize + min + binsize / 2 - - # Return the statistics - return(list(colname, histStats)) -} + # Normalize the data + xnorm <- (x - min) / (max - min) + + # Round the data to 4 significant digits. This is to avoid rounding issues. + xnorm <- cast(xnorm * 10000, "integer") / 10000.0 + + # Since min = 0, max = 1 (data is already normalized) + normBinSize <- 1 / nbins + binsize <- (max - min) / nbins + approxBins <- xnorm / normBinSize + + # Adjust values that are equal to the upper bound of each bin + bins <- cast(approxBins - + ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), + "integer") + + df$bins <- bins + histStats <- collect(count(groupBy(df, "bins"))) + names(histStats) <- c("bins", "counts") + + # Fill bins with zero counts + y <- data.frame("bins"=seq(0, nbins - 1)) + histStats <- merge(histStats, y, all.x=T, all.y=T) + histStats[is.na(histStats$count), 2] <- 0 + + # Compute centroids + histStats$centroids <- histStats$bins * binsize + min + binsize / 2 + + # Return the statistics + return(histStats) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 055321184d3f9..6bc1042323a6b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -101,7 +101,7 @@ setGeneric("glom", function(x) { 
standardGeneric("glom") }) # @rdname histogram # @export -setGeneric("histogram", function(df, colname, nbins) { standardGeneric("histogram") }) +setGeneric("histogram", function(df, colname, nbins=10) { standardGeneric("histogram") }) # @rdname keyBy # @export diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index c8e5782329ea9..17b01055711a7 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1899,24 +1899,16 @@ test_that("Method str()", { test_that("Histogram", { - suppressWarnings({ - # If ggplot2 is not installed, install it - if (!("ggplot2" %in% installed.packages()[, 1])) { - install.packages("ggplot2", repos = "http://cran.us.r-project.org") - } - library(ggplot2) - }) - # Basic histogram test expect_equal( - all(histogram(irisDF, "Petal_Width", 8)$data == + all(histogram(irisDF, "Petal_Width", 8) == data.frame(bins=seq(0, 7), counts=c(48, 2, 7, 21, 24, 19, 15, 14), centroids=seq(0,7) * 0.3 + 0.25)), TRUE) # Missing nbins - expect_equal(length(histogram(irisDF, "Petal_Width")$data$counts), 10) + expect_equal(length(histogram(irisDF, "Petal_Width")$counts), 10) # Wrong colname expect_error(histogram(irisDF, "xxx"), @@ -1928,11 +1920,11 @@ test_that("Histogram", { # Test against R's hist expect_equal(all(hist(iris$Sepal.Width)$counts == - histogram(irisDF, "Sepal_Width", 12)$data$counts), T) + histogram(irisDF, "Sepal_Width", 12)$counts), T) # Test when there are zero counts df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100))) - expect_equal(histogram(df, "x")$data$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) + expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) }) unlink(parquetPath) unlink(jsonPath) From 971d3060f411c0d481ef9258a1c27a17d949fa3f Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 22 Mar 2016 15:26:13 -0700 Subject: [PATCH 06/22] Fixed style issues --- R/pkg/R/functions.R | 10 +++++----- R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index c413d5e4c2d4f..0ff0489548f06 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2653,7 +2653,7 @@ setMethod("sort_array", #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Compute histogram statistics -#' histData <- histogram(df, "colname"Sepal_Length", nbins=12) +#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12) #' #' # Once SparkR has computed the histogram statistics, it would be very easy to #' # render the histogram using R's visualization packages such as ggplot2. @@ -2661,7 +2661,7 @@ setMethod("sort_array", #' } setMethod("histogram", signature(df = "DataFrame"), - function(df, colname, nbins=10) { + function(df, colname, nbins = 10) { # Validate nbins if (nbins < 2) { stop("The number of bins must be a positive integer number greater than 1.") @@ -2679,7 +2679,7 @@ setMethod("histogram", df <- na.omit(df[, colname]) # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed - x <- eval(parse(text=paste0("df$", colname))) + x <- eval(parse(text = paste0("df$", colname))) stats <- collect(describe(df[, colname])) min <- as.numeric(stats[4,2]) @@ -2706,8 +2706,8 @@ setMethod("histogram", names(histStats) <- c("bins", "counts") # Fill bins with zero counts - y <- data.frame("bins"=seq(0, nbins - 1)) - histStats <- merge(histStats, y, all.x=T, all.y=T) + y <- data.frame("bins" = seq(0, nbins - 1)) + histStats <- merge(histStats, y, all.x = T, all.y = T) histStats[is.na(histStats$count), 2] <- 0 # Compute centroids diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 17b01055711a7..3e459ab113b8b 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1902,9 +1902,9 @@ test_that("Histogram", { # Basic histogram test expect_equal( all(histogram(irisDF, "Petal_Width", 8) == - data.frame(bins=seq(0, 7), - counts=c(48, 2, 7, 21, 24, 19, 15, 14), - centroids=seq(0,7) * 0.3 + 0.25)), + data.frame(bins = seq(0, 7), + counts = c(48, 2, 7, 21, 24, 19, 15, 14), + centroids = seq(0,7) * 0.3 + 0.25)), TRUE) # Missing nbins @@ -1915,7 +1915,7 @@ test_that("Histogram", { "Specified colname does not belong to the given DataFrame.") # Invalid nbins - expect_error(histogram(irisDF, "Petal_Width", nbins=0), + expect_error(histogram(irisDF, "Petal_Width", nbins = 0), "The number of bins must be a positive integer number greater than 1.") # Test against R's hist @@ -1923,7 +1923,7 @@ test_that("Histogram", { histogram(irisDF, "Sepal_Width", 12)$counts), T) # Test when there are zero counts - df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100))) + df <- as.DataFrame(sqlContext, data.frame(x = c(1,2,3,4,100))) expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) }) unlink(parquetPath) From c06344efa7508c72871cf1c48c3a2246fc4475d5 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 22 Mar 2016 15:57:49 -0700 Subject: [PATCH 07/22] Fixed style issues --- R/pkg/R/functions.R | 4 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 0ff0489548f06..284634b148aa7 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2682,8 +2682,8 @@ setMethod("histogram", x <- eval(parse(text = paste0("df$", colname))) stats <- collect(describe(df[, colname])) - min <- as.numeric(stats[4,2]) - max <- as.numeric(stats[5,2]) + min <- as.numeric(stats[4, 2]) + max <- as.numeric(stats[5, 2]) # Normalize the data xnorm <- (x - min) / (max - min) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3e459ab113b8b..0407430b32d04 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1904,7 +1904,7 @@ test_that("Histogram", { all(histogram(irisDF, "Petal_Width", 8) == data.frame(bins = seq(0, 7), counts = c(48, 2, 7, 21, 24, 19, 15, 14), - centroids = seq(0,7) * 0.3 + 0.25)), + centroids = seq(0, 7) * 0.3 + 0.25)), TRUE) # Missing nbins @@ -1923,7 +1923,7 @@ test_that("Histogram", { histogram(irisDF, "Sepal_Width", 12)$counts), T) # Test when there are zero counts - df <- as.DataFrame(sqlContext, data.frame(x = c(1,2,3,4,100))) + df <- as.DataFrame(sqlContext, data.frame(x = c(1, 2, 3, 4, 100))) expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) }) unlink(parquetPath) From 468adbf4ce1a4b367b8b8513d0b8623d23c79e60 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Wed, 23 Mar 2016 17:28:14 -0700 Subject: [PATCH 08/22] Added example to render the histogram with ggplot2, and added documentation tags --- R/pkg/R/functions.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 284634b148aa7..299f75c00a769 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2643,21 +2643,28 @@ setMethod("sort_array", #' #' @name histogram #' @title Histogram -#' @param nbins the number of bins (optional). The default is 10. +#' @param nbins the number of bins (optional). Default value is 10. #' @param df the DataFrame containing the Column to build the histogram from. #' @param colname the name of the column to build the histogram from. #' @return a data.frame with the histogram statistics, i.e., counts and centroids. -#' @examples \dontrun{ -#' +#' @rdname histogram +#' @family agg_funcs +#' @export +#' @examples +#' \dontrun{ #' # Create a DataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Compute histogram statistics #' histData <- histogram(df, "colname"Sepal_Length", nbins = 12) #' -#' # Once SparkR has computed the histogram statistics, it would be very easy to -#' # render the histogram using R's visualization packages such as ggplot2. -#' +#' # Once SparkR has computed the histogram statistics, the histogram can be +#' # rendered using the ggplot2 library: +#' +#' require(ggplot2) +#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) +#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) +#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") #' } setMethod("histogram", signature(df = "DataFrame"), From dbc9d75584b7ae3ab952c7a88b19b21ad00a5a82 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Thu, 24 Mar 2016 09:46:42 -0700 Subject: [PATCH 09/22] Round nbins to the smallest integer --- R/pkg/R/functions.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 299f75c00a769..ab4ed74733b3e 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2674,6 +2674,9 @@ setMethod("histogram", stop("The number of bins must be a positive integer number greater than 1.") } + # Round nbins to the smallest integer + nbins <- floor(nbins) + # Validate colname if (is.null(colname) | is.na(colname)) { stop("colname must be specified.") From 19f995c8f72efb58b818f81a122529680c24f5ec Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 25 Mar 2016 16:48:33 -0700 Subject: [PATCH 10/22] Added support for Columns --- R/pkg/R/functions.R | 35 ++++++++++++++--------- R/pkg/R/generics.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 +++++++++++- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index ab4ed74733b3e..d49765c8ce9b6 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2667,8 +2667,8 @@ setMethod("sort_array", #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") #' } setMethod("histogram", - signature(df = "DataFrame"), - function(df, colname, nbins = 10) { + signature(df = "DataFrame", col="characterOrColumn"), + function(df, col, nbins = 10) { # Validate nbins if (nbins < 2) { stop("The number of bins must be a positive integer number greater than 1.") @@ -2677,19 +2677,28 @@ setMethod("histogram", # Round nbins to the smallest integer nbins <- floor(nbins) - # Validate colname - if (is.null(colname) | is.na(colname)) { - stop("colname must be specified.") + # Validate col + if (is.null(col)) { + stop("col must be specified.") } - if (!colname %in% names(df)) { - stop("Specified colname does not belong to the given DataFrame.") - } - - # Filter NA values in the target column - df <- na.omit(df[, colname]) - 
# TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed - x <- eval(parse(text = paste0("df$", colname))) + colname <- col + x <- if (class(col) == "character") { + if (!colname %in% names(df)) { + stop("Specified colname does not belong to the given DataFrame.") + } + + # Filter NA values in the target column + df <- na.omit(df[, colname]) + + # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed + eval(parse(text = paste0("df$", colname))) + } else if (class(col) == "Column") { + # Append the given column to the dataset + df$x <- col + colname <- "x" + col + } stats <- collect(describe(df[, colname])) min <- as.numeric(stats[4, 2]) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 6bc1042323a6b..323b4f3d307b4 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -101,7 +101,7 @@ setGeneric("glom", function(x) { standardGeneric("glom") }) # @rdname histogram # @export -setGeneric("histogram", function(df, colname, nbins=10) { standardGeneric("histogram") }) +setGeneric("histogram", function(df, col, nbins=10) { standardGeneric("histogram") }) # @rdname keyBy # @export diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 0407430b32d04..9bd6d79f91127 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1899,7 +1899,7 @@ test_that("Method str()", { test_that("Histogram", { - # Basic histogram test + # Basic histogram test with colname expect_equal( all(histogram(irisDF, "Petal_Width", 8) == data.frame(bins = seq(0, 7), @@ -1907,6 +1907,22 @@ test_that("Histogram", { centroids = seq(0, 7) * 0.3 + 0.25)), TRUE) + # Basic histogram test with Column + expect_equal( + all(histogram(irisDF, irisDF$Petal_Width, 8) == + data.frame(bins = seq(0, 7), + counts = c(48, 2, 7, 21, 24, 19, 15, 14), + centroids = seq(0, 7) * 0.3 + 0.25)), + TRUE) + + # Basic histogram test with derived column + expect_equal( + 
all(round(histogram(irisDF, irisDF$Petal_Width + 1, 8), 2) == + data.frame(bins = seq(0, 7), + counts = c(48, 2, 7, 21, 24, 19, 15, 14), + centroids = seq(0, 7) * 0.3 + 1.25)), + TRUE) + # Missing nbins expect_equal(length(histogram(irisDF, "Petal_Width")$counts), 10) From 2800492e307253d7f6004944b2e4beb11f76c330 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 25 Mar 2016 16:53:43 -0700 Subject: [PATCH 11/22] Fixed style --- R/pkg/R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index d49765c8ce9b6..e81805262d919 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2667,7 +2667,7 @@ setMethod("sort_array", #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") #' } setMethod("histogram", - signature(df = "DataFrame", col="characterOrColumn"), + signature(df = "DataFrame", col = "characterOrColumn"), function(df, col, nbins = 10) { # Validate nbins if (nbins < 2) { From adc34461a869d4b4c072952b999c896047e994d6 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 19 Apr 2016 11:09:58 -0700 Subject: [PATCH 12/22] Removed Roxygen auto generated comment --- R/pkg/DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 0cd0d75df0f70..bea3ed80e4166 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -36,4 +36,3 @@ Collate: 'stats.R' 'types.R' 'utils.R' -RoxygenNote: 5.0.1 From 046b7dad841bbf13d1d4b93bf001474f74b25865 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 19 Apr 2016 14:14:11 -0700 Subject: [PATCH 13/22] Added na.omit to the case when input is a Column --- R/pkg/R/functions.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index e81805262d919..3ae29a7990ef0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2688,18 +2688,26 @@ setMethod("histogram", stop("Specified colname does not belong to the given DataFrame.") } - # Filter NA values in the target column + # Filter NA values in the target column and remove all other columns df <- na.omit(df[, colname]) # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed - eval(parse(text = paste0("df$", colname))) + getColumn(df, colname) + #eval(parse(text = paste0("df$", colname))) } else if (class(col) == "Column") { - # Append the given column to the dataset + # Append the given column to the dataset. This is to support Columns that + # don't belong to the DataFrame but are rather expressions df$x <- col + + # Filter NA values in the target column. Cannot remove all other columns + # since given Column may be an expression on one or more existing columns + df <- na.omit(df) + colname <- "x" col } + # At this point, df only has one column: the one to compute the histogram from stats <- collect(describe(df[, colname])) min <- as.numeric(stats[4, 2]) max <- as.numeric(stats[5, 2]) From b03c335a6dc9818f54fc2633fb149f9f3ad0277d Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 19 Apr 2016 14:22:54 -0700 Subject: [PATCH 14/22] Added na.omit to the case when input is a Column --- R/pkg/R/functions.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 3ae29a7990ef0..f64f078f3de13 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2693,7 +2693,6 @@ setMethod("histogram", # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed getColumn(df, colname) - #eval(parse(text = paste0("df$", colname))) } else if (class(col) == "Column") { # Append the given column to the dataset. This is to support Columns that # don't belong to the DataFrame but are rather expressions From c2c4601b1a09e23e5b1be64fb027b92f9638da20 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 21 Apr 2016 13:55:13 -0700 Subject: [PATCH 15/22] Moved histogram function to DataFrame.R --- R/pkg/R/DataFrame.R | 104 ++++++++++++++++++++++++++++++++++++++++++++ R/pkg/R/functions.R | 104 -------------------------------------------- 2 files changed, 104 insertions(+), 104 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 3b7b8250b94f7..f97202ff69e22 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2448,3 +2448,107 @@ setMethod("drop", function(x) { base::drop(x) }) + +#' This function computes a histogram for a given SparkR Column. +#' +#' @name histogram +#' @title Histogram +#' @param nbins the number of bins (optional). Default value is 10. +#' @param df the DataFrame containing the Column to build the histogram from. +#' @param col the column to build the histogram from, given either as a column name (character) or as a Column object. +#' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
+#' @rdname histogram +#' @family agg_funcs +#' @export +#' @examples +#' \dontrun{ +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Compute histogram statistics +#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12) +#' +#' # Once SparkR has computed the histogram statistics, the histogram can be +#' # rendered using the ggplot2 library: +#' +#' require(ggplot2) +#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) +#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) +#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") +#' } +setMethod("histogram", + signature(df = "DataFrame", col = "characterOrColumn"), + function(df, col, nbins = 10) { + # Validate nbins + if (nbins < 2) { + stop("The number of bins must be a positive integer number greater than 1.") + } + + # Round nbins to the smallest integer + nbins <- floor(nbins) + + # Validate col + if (is.null(col)) { + stop("col must be specified.") + } + + colname <- col + x <- if (class(col) == "character") { + if (!colname %in% names(df)) { + stop("Specified colname does not belong to the given DataFrame.") + } + + # Filter NA values in the target column and remove all other columns + df <- na.omit(df[, colname]) + getColumn(df, colname) + + } else if (class(col) == "Column") { + + # Append the given column to the dataset. This is to support Columns that + # don't belong to the DataFrame but are rather expressions + df$x <- col + + # Filter NA values in the target column. 
Cannot remove all other columns + # since given Column may be an expression on one or more existing columns + df <- na.omit(df) + + colname <- "x" + col + } + + # At this point, df only has one column: the one to compute the histogram from + stats <- collect(describe(df[, colname])) + min <- as.numeric(stats[4, 2]) + max <- as.numeric(stats[5, 2]) + + # Normalize the data + xnorm <- (x - min) / (max - min) + + # Round the data to 4 significant digits. This is to avoid rounding issues. + xnorm <- cast(xnorm * 10000, "integer") / 10000.0 + + # Since min = 0, max = 1 (data is already normalized) + normBinSize <- 1 / nbins + binsize <- (max - min) / nbins + approxBins <- xnorm / normBinSize + + # Adjust values that are equal to the upper bound of each bin + bins <- cast(approxBins - + ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), + "integer") + + df$bins <- bins + histStats <- collect(count(groupBy(df, "bins"))) + names(histStats) <- c("bins", "counts") + + # Fill bins with zero counts + y <- data.frame("bins" = seq(0, nbins - 1)) + histStats <- merge(histStats, y, all.x = T, all.y = T) + histStats[is.na(histStats$count), 2] <- 0 + + # Compute centroids + histStats$centroids <- histStats$bins * binsize + min + binsize / 2 + + # Return the statistics + return(histStats) + }) \ No newline at end of file diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f64f078f3de13..e5521f3cffadf 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2638,107 +2638,3 @@ setMethod("sort_array", jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc) column(jc) }) - -#' This function computes a histogram for a given SparkR Column. -#' -#' @name histogram -#' @title Histogram -#' @param nbins the number of bins (optional). Default value is 10. -#' @param df the DataFrame containing the Column to build the histogram from. -#' @param colname the name of the column to build the histogram from. 
-#' @return a data.frame with the histogram statistics, i.e., counts and centroids. -#' @rdname histogram -#' @family agg_funcs -#' @export -#' @examples -#' \dontrun{ -#' # Create a DataFrame from the Iris dataset -#' irisDF <- createDataFrame(sqlContext, iris) -#' -#' # Compute histogram statistics -#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12) -#' -#' # Once SparkR has computed the histogram statistics, the histogram can be -#' # rendered using the ggplot2 library: -#' -#' require(ggplot2) -#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) -#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) -#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") -#' } -setMethod("histogram", - signature(df = "DataFrame", col = "characterOrColumn"), - function(df, col, nbins = 10) { - # Validate nbins - if (nbins < 2) { - stop("The number of bins must be a positive integer number greater than 1.") - } - - # Round nbins to the smallest integer - nbins <- floor(nbins) - - # Validate col - if (is.null(col)) { - stop("col must be specified.") - } - - colname <- col - x <- if (class(col) == "character") { - if (!colname %in% names(df)) { - stop("Specified colname does not belong to the given DataFrame.") - } - - # Filter NA values in the target column and remove all other columns - df <- na.omit(df[, colname]) - - # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed - getColumn(df, colname) - } else if (class(col) == "Column") { - # Append the given column to the dataset. This is to support Columns that - # don't belong to the DataFrame but are rather expressions - df$x <- col - - # Filter NA values in the target column. 
Cannot remove all other columns - # since given Column may be an expression on one or more existing columns - df <- na.omit(df) - - colname <- "x" - col - } - - # At this point, df only has one column: the one to compute the histogram from - stats <- collect(describe(df[, colname])) - min <- as.numeric(stats[4, 2]) - max <- as.numeric(stats[5, 2]) - - # Normalize the data - xnorm <- (x - min) / (max - min) - - # Round the data to 4 significant digits. This is to avoid rounding issues. - xnorm <- cast(xnorm * 10000, "integer") / 10000.0 - - # Since min = 0, max = 1 (data is already normalized) - normBinSize <- 1 / nbins - binsize <- (max - min) / nbins - approxBins <- xnorm / normBinSize - - # Adjust values that are equal to the upper bound of each bin - bins <- cast(approxBins - - ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), - "integer") - - df$bins <- bins - histStats <- collect(count(groupBy(df, "bins"))) - names(histStats) <- c("bins", "counts") - - # Fill bins with zero counts - y <- data.frame("bins" = seq(0, nbins - 1)) - histStats <- merge(histStats, y, all.x = T, all.y = T) - histStats[is.na(histStats$count), 2] <- 0 - - # Compute centroids - histStats$centroids <- histStats$bins * binsize + min + binsize / 2 - - # Return the statistics - return(histStats) - }) From fc4c536ca55fe4beefc27139dad03093cff7194e Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 22 Apr 2016 10:37:52 -0700 Subject: [PATCH 16/22] Minor docs fix --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5646d3a93f84d..780009453f001 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2474,7 +2474,7 @@ setMethod("drop", #' @param colname the name of the column to build the histogram from. #' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
#' @rdname histogram -#' @family agg_funcs +#' @family DataFrame functions #' @export #' @examples #' \dontrun{ From 976e412e7cdcbee95164f05eaf088e5ec7b08160 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 25 Apr 2016 14:47:04 -0700 Subject: [PATCH 17/22] pkg/R/DataFrame.R --- R/pkg/R/DataFrame.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index afadf8406eaaa..38ca550f3ee8b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2469,7 +2469,6 @@ setMethod("drop", base::drop(x) }) -<<<<<<< HEAD #' This function computes a histogram for a given SparkR Column. #' #' @name histogram From 96714fdede2dd42f348357ded908305589aecc91 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 26 Apr 2016 12:28:29 -0700 Subject: [PATCH 18/22] Added dynamic colname generation to avoid colliding with existing columns --- R/pkg/R/DataFrame.R | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 38ca550f3ee8b..88e5c9b096241 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2478,7 +2478,7 @@ setMethod("drop", #' @param colname the name of the column to build the histogram from. #' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
#' @rdname histogram -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @export #' @examples #' \dontrun{ @@ -2486,7 +2486,7 @@ setMethod("drop", #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Compute histogram statistics -#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12) +#' histData <- histogram(df, df$Sepal_Length, nbins = 12) #' #' # Once SparkR has computed the histogram statistics, the histogram can be #' # rendered using the ggplot2 library: @@ -2497,7 +2497,7 @@ setMethod("drop", #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") #' } setMethod("histogram", - signature(df = "DataFrame", col = "characterOrColumn"), + signature(df = "SparkDataFrame", col = "characterOrColumn"), function(df, col, nbins = 10) { # Validate nbins if (nbins < 2) { @@ -2524,15 +2524,30 @@ setMethod("histogram", } else if (class(col) == "Column") { + # The given column needs to be appended to the SparkDataFrame so that we can + # use method describe() to compute statistics in one single pass. The new + # column must have a name that doesn't exist in the dataset. + # To do so, we generate a random column name with more characters than the + # longest colname in the dataset, but no more than 100 (think of a UUID). + # This column name will never be visible to the user, so the name is irrelevant. + # Limiting the colname length to 100 makes debugging easier and it does + # introduce a negligible probability of collision: assuming the user has 1 million + # columns AND all of them have names 100 characters long (which is very unlikely), + # AND they run 1 billion histograms, the probability of collision will roughly be + # 1 in 4.4 x 10 ^ 96 + colname <- paste(base:::sample(c(letters, LETTERS), + size = min(max(nchar(colnames(df))) + 1, 100), + replace=TRUE), + collapse="") + # Append the given column to the dataset. 
This is to support Columns that # don't belong to the DataFrame but are rather expressions - df$x <- col + df <- withColumn(df, colname, col) # Filter NA values in the target column. Cannot remove all other columns # since given Column may be an expression on one or more existing columns df <- na.omit(df) - colname <- "x" col } From cd7ba4c3af26beba4ac4c0f09ea6f3560069d5a4 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 26 Apr 2016 14:14:41 -0700 Subject: [PATCH 19/22] Fixed ggplot example --- R/pkg/R/DataFrame.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 88e5c9b096241..6946caef7f95e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2482,19 +2482,20 @@ setMethod("drop", #' @export #' @examples #' \dontrun{ +#' #' # Create a DataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Compute histogram statistics -#' histData <- histogram(df, df$Sepal_Length, nbins = 12) +#' histStats <- histogram(irisDF, irisDF$Sepal_Length, nbins = 12) #' #' # Once SparkR has computed the histogram statistics, the histogram can be #' # rendered using the ggplot2 library: #' #' require(ggplot2) -#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) -#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100) -#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency") +#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) + +#' geom_bar(stat = "identity") + +#' xlab("Sepal_Length") + ylab("Frequency") #' } setMethod("histogram", signature(df = "SparkDataFrame", col = "characterOrColumn"), From e9dbc5b27c258777a539723e0ad4676db928736b Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 26 Apr 2016 14:24:28 -0700 Subject: [PATCH 20/22] Fixed style issues --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 6946caef7f95e..577089c7c3c4f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2538,8 +2538,8 @@ setMethod("histogram", # 1 in 4.4 x 10 ^ 96 colname <- paste(base:::sample(c(letters, LETTERS), size = min(max(nchar(colnames(df))) + 1, 100), - replace=TRUE), - collapse="") + replace = TRUE), + collapse = "") # Append the given column to the dataset. This is to support Columns that # don't belong to the DataFrame but are rather expressions From fc2f6a31166ac895b5c2ce05074f5c7edf372706 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 26 Apr 2016 14:57:57 -0700 Subject: [PATCH 21/22] Changes DataFrame for SparkDataFrame --- R/pkg/R/DataFrame.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 577089c7c3c4f..36aedfae86b33 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2474,7 +2474,7 @@ setMethod("drop", #' @name histogram #' @title Histogram #' @param nbins the number of bins (optional). Default value is 10. -#' @param df the DataFrame containing the Column to build the histogram from. +#' @param df the SparkDataFrame containing the Column to build the histogram from. #' @param colname the name of the column to build the histogram from. #' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
#' @rdname histogram @@ -2483,7 +2483,7 @@ setMethod("drop", #' @examples #' \dontrun{ #' -#' # Create a DataFrame from the Iris dataset +#' # Create a SparkDataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Compute histogram statistics @@ -2516,7 +2516,7 @@ setMethod("histogram", colname <- col x <- if (class(col) == "character") { if (!colname %in% names(df)) { - stop("Specified colname does not belong to the given DataFrame.") + stop("Specified colname does not belong to the given SparkDataFrame.") } # Filter NA values in the target column and remove all other columns @@ -2542,7 +2542,7 @@ setMethod("histogram", collapse = "") # Append the given column to the dataset. This is to support Columns that - # don't belong to the DataFrame but are rather expressions + # don't belong to the SparkDataFrame but are rather expressions df <- withColumn(df, colname, col) # Filter NA values in the target column. Cannot remove all other columns From 838c9155839bbb7fd4d5f855a9d88ae68fef2ffb Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 26 Apr 2016 15:18:06 -0700 Subject: [PATCH 22/22] Changed error message on histogram tests --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 9e073d4b9ffc7..336068035eaf8 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2003,7 +2003,7 @@ test_that("Histogram", { # Wrong colname expect_error(histogram(irisDF, "xxx"), - "Specified colname does not belong to the given DataFrame.") + "Specified colname does not belong to the given SparkDataFrame.") # Invalid nbins expect_error(histogram(irisDF, "Petal_Width", nbins = 0),