From efc2f6634b54cd91e4946d4d4e04be769769f4ad Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Mon, 7 Mar 2016 16:19:07 -0800
Subject: [PATCH 01/22] Added histogram function

---
 R/pkg/NAMESPACE                           |   1 +
 R/pkg/R/functions.R                       | 111 ++++++++++++++++++++++
 R/pkg/R/generics.R                        |   4 +
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  28 ++++++
 4 files changed, 144 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 6a3d63f43f785..ccaee147a227e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -164,6 +164,7 @@ exportMethods("%in%",
               "getItem",
               "greatest",
               "hex",
+              "histogram",
               "hour",
               "hypot",
               "ifelse",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e5521f3cffadf..711489cdf586d 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2638,3 +2638,114 @@ setMethod("sort_array",
             jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc)
             column(jc)
           })
+
+#' This function renders a histogram for a given SparkR Column. Note: This function depends on the ggplot2 package.
+#' 
+#' @name histogram
+#' @title Histogram
+#' @param nbins the number of bins (optional). The default is 10.
+#' @param df the DataFrame containing the Column to build the histogram from.
+#' @param colname the name of the column to build the histogram from.
+#' @return a ggplot object with the histogram
+#' @examples \dontrun{
+#' 
+#' # Create a DataFrame from the Irijjs dataset
+#' irisDF <- createDataFrame(sqlContext, iris)
+#' 
+#' # Render a histogram for the Sepal_Length column 
+#' histogram(irisDF, "Sepal_Length", nbins=12)
+#' }
+# TODO: the name of this function will be changed to hist to match R's after SPARK-9325 is fixed
+setMethod("histogram",
+          signature(df="DataFrame"),
+          function (df, colname, nbins) {
+            # Require ggplot library
+            require(ggplot2)
+
+            # Compute statistics for the histogram
+            if (missing(nbins)) {
+              histData <- hist.stats(df, colname)
+            } else {
+              histData <- hist.stats(df, colname, nbins)
+            }
+
+            # Get the target column name
+            targetColname <- histData[[1]]
+
+            # Get the input data for the plot
+            histStats <- histData[[2]]
+
+            # Render the plot from the stats. Add axis labels.
+            plot <- ggplot(histStats, aes(x = centroids, y = counts))
+            plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
+            plot <- plot + xlab(targetColname) + ylab("Frequency")
+
+            return(plot)
+         })
+
+#' This function computes statistics to render a histogram. These are 
+#' centroids and counts.
+#' 
+#' @name hist.stats
+#' @title Compute histogram statistics
+#' @param formula a formula or a bigr.vector specifying the data source
+#' @param nbins the number of bins (optional)
+#' @param data optional bigr.frame. If specified, this serves as the 
+#'   environment for column references in the formula that don't have an
+#'   explicit bigr.frame.
+#' @return a data.frame with the computed statistics
+#' @seealso \link{bigr.histogram}
+hist.stats <- function(df, colname=NULL, nbins=10) {
+  # Validate nbins
+  if (nbins < 2) {
+    stop("The number of bins must be a positive integer number greater than 1.")
+  }
+
+  # Validate colname
+  if (is.null(colname) | is.na(colname)) {
+    stop("colname must be specified.")
+  }
+  if (!colname %in% colnames(df)) {
+    stop("Specified colname does not belong to the given DataFrame.")
+  }
+
+  # Filter null values (i.e., NA's) in all grouping columns as well as in the target column
+  df <- na.omit(df[, colname])
+
+  # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
+  x <- eval(parse(text=paste0("df$", colname)))
+
+  stats <- collect(describe(df[, colname]))
+  min <- as.numeric(stats[4,2])
+  max <- as.numeric(stats[5,2])
+
+  # Normalize the data
+  xnorm <- (x - min) / (max - min)
+
+  # Round the data to 4 significant digits. This is to avoid rounding issues.
+  xnorm <- cast(xnorm * 10000, "integer") / 10000.0
+
+  # Since min = 0, max = 1 (data is already normalized)
+  normBinSize <- 1 / nbins
+  binsize <- (max - min) / nbins
+  approxBins <- xnorm / normBinSize
+
+  # Adjust values that are equal to the upper bound of each bin
+  bins <- cast(approxBins -
+                     ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), "integer")
+
+  df$bins <- bins
+  histStats <- collect(count(groupBy(df, "bins")))
+  names(histStats) <- c("bins", "counts")
+
+  # Fill bins with zero counts
+  y <- data.frame("bins"=seq(0, nbins - 1))
+  histStats <- merge(histStats, y, all.x=T, all.y=T)
+  histStats[is.na(histStats$count), 2] <- 0
+
+  # Compute centroids
+  histStats$centroids <- histStats$bins * binsize + min + binsize / 2
+
+  # Return the statistics
+  return(list(colname, histStats))
+}
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ab61bce03df23..055321184d3f9 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -99,6 +99,10 @@ setGeneric("getJRDD", function(rdd, ...) { standardGeneric("getJRDD") })
 # @export
 setGeneric("glom", function(x) { standardGeneric("glom") })
 
+# @rdname histogram
+# @export
+setGeneric("histogram", function(df, colname, nbins) { standardGeneric("histogram") })
+
 # @rdname keyBy
 # @export
 setGeneric("keyBy", function(x, func) { standardGeneric("keyBy") })
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index cc118108f61cc..2b9b1ab5abe97 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1897,6 +1897,34 @@ test_that("Method str()", {
   expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
 })
 
+test_that("Histogram", {
+
+  # Basic histogram test
+  expect_equal(
+    all(histogram(irisDF, "Petal_Width", 8)$data ==
+        data.frame(bins=seq(0, 7),
+                   counts=c(48, 2, 7, 21, 24, 19, 15, 14),
+                   centroids=seq(0,7) * 0.3 + 0.25)),
+        TRUE)
+
+  # Missing nbins
+  expect_equal(length(histogram(irisDF, "Petal_Width")$data$counts), 10)
+
+  # Wrong colname
+  expect_error(histogram(irisDF, "xxx"),
+               "Specified colname does not belong to the given DataFrame.")
+
+  # Invalid nbins
+  expect_error(histogram(irisDF, "Petal_Width", nbins=0),
+               "The number of bins must be a positive integer number greater than 1.")
+
+  # Test against R's hist
+  expect_equal(all(hist(iris$Sepal.Width)$counts ==
+                   histogram(irisDF, "Sepal_Width", 12)$data$counts), T)
+
+  # Test when there are zero counts
+  df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100)))
+})
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)

From 0ad424bbcd03bf4c57566dbe92e537db213ba187 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Mon, 7 Mar 2016 16:43:46 -0800
Subject: [PATCH 02/22] Added test case where some bins have zero counts

---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 2b9b1ab5abe97..999671f3d3548 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1924,6 +1924,7 @@ test_that("Histogram", {
 
   # Test when there are zero counts
   df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100)))
+  expect_equal(histogram(df, "x")$data$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
 })
 unlink(parquetPath)
 unlink(jsonPath)

From d19992b4ec5141221cbf8724dc592b09e541039b Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Mon, 7 Mar 2016 17:25:15 -0800
Subject: [PATCH 03/22] Added check for ggplot2 package

---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 999671f3d3548..793c55131672e 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1899,6 +1899,12 @@ test_that("Method str()", {
 
 test_that("Histogram", {
 
+  # If ggplot2 is not installed, install it
+  if (!("ggplot2" %in% installed.packages()[, 1])) {
+    install.packages("ggplot2", repos = "http://cran.us.r-project.org")
+  }
+  library(ggplot2)
+
   # Basic histogram test
   expect_equal(
     all(histogram(irisDF, "Petal_Width", 8)$data ==

From ac8f4c9ca56b592c32c60dc945023050df89bdb4 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.attlocal.net>
Date: Mon, 7 Mar 2016 19:13:36 -0800
Subject: [PATCH 04/22] Suppressed warnings for loading ggplot2

---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 793c55131672e..c8e5782329ea9 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1899,11 +1899,13 @@ test_that("Method str()", {
 
 test_that("Histogram", {
 
-  # If ggplot2 is not installed, install it
-  if (!("ggplot2" %in% installed.packages()[, 1])) {
-    install.packages("ggplot2", repos = "http://cran.us.r-project.org")
-  }
-  library(ggplot2)
+  suppressWarnings({
+    # If ggplot2 is not installed, install it
+    if (!("ggplot2" %in% installed.packages()[, 1])) {
+      install.packages("ggplot2", repos = "http://cran.us.r-project.org")
+    }
+    library(ggplot2)
+  })
 
   # Basic histogram test
   expect_equal(

From 125b82dd2b58490ed66b9d8dd6855c8db8faee8d Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Mon, 21 Mar 2016 17:14:21 -0700
Subject: [PATCH 05/22] Modified histogram to remove ggplot2 dependency

---
 R/pkg/DESCRIPTION                         |   5 +-
 R/pkg/R/functions.R                       | 153 +++++++++-------------
 R/pkg/R/generics.R                        |   2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  16 +--
 4 files changed, 68 insertions(+), 108 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 465bc37788e5d..0cd0d75df0f70 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -18,10 +18,10 @@ Collate:
     'schema.R'
     'generics.R'
     'jobj.R'
-    'RDD.R'
-    'pairRDD.R'
     'column.R'
     'group.R'
+    'RDD.R'
+    'pairRDD.R'
     'DataFrame.R'
     'SQLContext.R'
     'backend.R'
@@ -36,3 +36,4 @@ Collate:
     'stats.R'
     'types.R'
     'utils.R'
+RoxygenNote: 5.0.1
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 711489cdf586d..c413d5e4c2d4f 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2639,113 +2639,80 @@ setMethod("sort_array",
             column(jc)
           })
 
-#' This function renders a histogram for a given SparkR Column. Note: This function depends on the ggplot2 package.
+#' This function computes a histogram for a given SparkR Column.
 #' 
 #' @name histogram
 #' @title Histogram
 #' @param nbins the number of bins (optional). The default is 10.
 #' @param df the DataFrame containing the Column to build the histogram from.
 #' @param colname the name of the column to build the histogram from.
-#' @return a ggplot object with the histogram
+#' @return a data.frame with the histogram statistics, i.e., counts and centroids.
 #' @examples \dontrun{
 #' 
-#' # Create a DataFrame from the Irijjs dataset
+#' # Create a DataFrame from the Iris dataset
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
-#' # Render a histogram for the Sepal_Length column 
-#' histogram(irisDF, "Sepal_Length", nbins=12)
-#' }
-# TODO: the name of this function will be changed to hist to match R's after SPARK-9325 is fixed
+#' # Compute histogram statistics
+#' histData <- histogram(df, "colname"Sepal_Length", nbins=12)
+#'
+#' # Once SparkR has computed the histogram statistics, it would be very easy to
+#' # render the histogram using R's visualization packages such as ggplot2.
+#'   
+#' } 
 setMethod("histogram",
-          signature(df="DataFrame"),
-          function (df, colname, nbins) {
-            # Require ggplot library
-            require(ggplot2)
-
-            # Compute statistics for the histogram
-            if (missing(nbins)) {
-              histData <- hist.stats(df, colname)
-            } else {
-              histData <- hist.stats(df, colname, nbins)
+          signature(df = "DataFrame"),
+          function(df, colname, nbins=10) {
+            # Validate nbins
+            if (nbins < 2) {
+              stop("The number of bins must be a positive integer number greater than 1.")
             }
 
-            # Get the target column name
-            targetColname <- histData[[1]]
+            # Validate colname
+            if (is.null(colname) | is.na(colname)) {
+              stop("colname must be specified.")
+            }
+            if (!colname %in% names(df)) {
+              stop("Specified colname does not belong to the given DataFrame.")
+            }
 
-            # Get the input data for the plot
-            histStats <- histData[[2]]
+            # Filter NA values in the target column
+            df <- na.omit(df[, colname])
 
-            # Render the plot from the stats. Add axis labels.
-            plot <- ggplot(histStats, aes(x = centroids, y = counts))
-            plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
-            plot <- plot + xlab(targetColname) + ylab("Frequency")
+            # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
+            x <- eval(parse(text=paste0("df$", colname)))
 
-            return(plot)
-         })
+            stats <- collect(describe(df[, colname]))
+            min <- as.numeric(stats[4,2])
+            max <- as.numeric(stats[5,2])
 
-#' This function computes statistics to render a histogram. These are 
-#' centroids and counts.
-#' 
-#' @name hist.stats
-#' @title Compute histogram statistics
-#' @param formula a formula or a bigr.vector specifying the data source
-#' @param nbins the number of bins (optional)
-#' @param data optional bigr.frame. If specified, this serves as the 
-#'   environment for column references in the formula that don't have an
-#'   explicit bigr.frame.
-#' @return a data.frame with the computed statistics
-#' @seealso \link{bigr.histogram}
-hist.stats <- function(df, colname=NULL, nbins=10) {
-  # Validate nbins
-  if (nbins < 2) {
-    stop("The number of bins must be a positive integer number greater than 1.")
-  }
-
-  # Validate colname
-  if (is.null(colname) | is.na(colname)) {
-    stop("colname must be specified.")
-  }
-  if (!colname %in% colnames(df)) {
-    stop("Specified colname does not belong to the given DataFrame.")
-  }
-
-  # Filter null values (i.e., NA's) in all grouping columns as well as in the target column
-  df <- na.omit(df[, colname])
-
-  # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
-  x <- eval(parse(text=paste0("df$", colname)))
-
-  stats <- collect(describe(df[, colname]))
-  min <- as.numeric(stats[4,2])
-  max <- as.numeric(stats[5,2])
-
-  # Normalize the data
-  xnorm <- (x - min) / (max - min)
-
-  # Round the data to 4 significant digits. This is to avoid rounding issues.
-  xnorm <- cast(xnorm * 10000, "integer") / 10000.0
-
-  # Since min = 0, max = 1 (data is already normalized)
-  normBinSize <- 1 / nbins
-  binsize <- (max - min) / nbins
-  approxBins <- xnorm / normBinSize
-
-  # Adjust values that are equal to the upper bound of each bin
-  bins <- cast(approxBins -
-                     ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0), "integer")
-
-  df$bins <- bins
-  histStats <- collect(count(groupBy(df, "bins")))
-  names(histStats) <- c("bins", "counts")
-
-  # Fill bins with zero counts
-  y <- data.frame("bins"=seq(0, nbins - 1))
-  histStats <- merge(histStats, y, all.x=T, all.y=T)
-  histStats[is.na(histStats$count), 2] <- 0
-
-  # Compute centroids
-  histStats$centroids <- histStats$bins * binsize + min + binsize / 2
-
-  # Return the statistics
-  return(list(colname, histStats))
-}
+            # Normalize the data
+            xnorm <- (x - min) / (max - min)
+
+            # Round the data to 4 significant digits. This is to avoid rounding issues.
+            xnorm <- cast(xnorm * 10000, "integer") / 10000.0
+
+            # Since min = 0, max = 1 (data is already normalized)
+            normBinSize <- 1 / nbins
+            binsize <- (max - min) / nbins
+            approxBins <- xnorm / normBinSize
+
+            # Adjust values that are equal to the upper bound of each bin
+            bins <- cast(approxBins -
+                         ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0),
+                         "integer")
+
+            df$bins <- bins
+            histStats <- collect(count(groupBy(df, "bins")))
+            names(histStats) <- c("bins", "counts")
+
+            # Fill bins with zero counts
+            y <- data.frame("bins"=seq(0, nbins - 1))
+            histStats <- merge(histStats, y, all.x=T, all.y=T)
+            histStats[is.na(histStats$count), 2] <- 0
+
+            # Compute centroids
+            histStats$centroids <- histStats$bins * binsize + min + binsize / 2
+
+            # Return the statistics
+            return(histStats)
+         })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 055321184d3f9..6bc1042323a6b 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -101,7 +101,7 @@ setGeneric("glom", function(x) { standardGeneric("glom") })
 
 # @rdname histogram
 # @export
-setGeneric("histogram", function(df, colname, nbins) { standardGeneric("histogram") })
+setGeneric("histogram", function(df, colname, nbins=10) { standardGeneric("histogram") })
 
 # @rdname keyBy
 # @export
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index c8e5782329ea9..17b01055711a7 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1899,24 +1899,16 @@ test_that("Method str()", {
 
 test_that("Histogram", {
 
-  suppressWarnings({
-    # If ggplot2 is not installed, install it
-    if (!("ggplot2" %in% installed.packages()[, 1])) {
-      install.packages("ggplot2", repos = "http://cran.us.r-project.org")
-    }
-    library(ggplot2)
-  })
-
   # Basic histogram test
   expect_equal(
-    all(histogram(irisDF, "Petal_Width", 8)$data ==
+    all(histogram(irisDF, "Petal_Width", 8) ==
         data.frame(bins=seq(0, 7),
                    counts=c(48, 2, 7, 21, 24, 19, 15, 14),
                    centroids=seq(0,7) * 0.3 + 0.25)),
         TRUE)
 
   # Missing nbins
-  expect_equal(length(histogram(irisDF, "Petal_Width")$data$counts), 10)
+  expect_equal(length(histogram(irisDF, "Petal_Width")$counts), 10)
 
   # Wrong colname
   expect_error(histogram(irisDF, "xxx"),
@@ -1928,11 +1920,11 @@ test_that("Histogram", {
 
   # Test against R's hist
   expect_equal(all(hist(iris$Sepal.Width)$counts ==
-                   histogram(irisDF, "Sepal_Width", 12)$data$counts), T)
+                   histogram(irisDF, "Sepal_Width", 12)$counts), T)
 
   # Test when there are zero counts
   df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100)))
-  expect_equal(histogram(df, "x")$data$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
+  expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
 })
 unlink(parquetPath)
 unlink(jsonPath)

From 971d3060f411c0d481ef9258a1c27a17d949fa3f Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 22 Mar 2016 15:26:13 -0700
Subject: [PATCH 06/22] Fixed style issues

---
 R/pkg/R/functions.R                       | 10 +++++-----
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index c413d5e4c2d4f..0ff0489548f06 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2653,7 +2653,7 @@ setMethod("sort_array",
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
 #' # Compute histogram statistics
-#' histData <- histogram(df, "colname"Sepal_Length", nbins=12)
+#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12)
 #'
 #' # Once SparkR has computed the histogram statistics, it would be very easy to
 #' # render the histogram using R's visualization packages such as ggplot2.
@@ -2661,7 +2661,7 @@ setMethod("sort_array",
 #' } 
 setMethod("histogram",
           signature(df = "DataFrame"),
-          function(df, colname, nbins=10) {
+          function(df, colname, nbins = 10) {
             # Validate nbins
             if (nbins < 2) {
               stop("The number of bins must be a positive integer number greater than 1.")
@@ -2679,7 +2679,7 @@ setMethod("histogram",
             df <- na.omit(df[, colname])
 
             # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
-            x <- eval(parse(text=paste0("df$", colname)))
+            x <- eval(parse(text = paste0("df$", colname)))
 
             stats <- collect(describe(df[, colname]))
             min <- as.numeric(stats[4,2])
@@ -2706,8 +2706,8 @@ setMethod("histogram",
             names(histStats) <- c("bins", "counts")
 
             # Fill bins with zero counts
-            y <- data.frame("bins"=seq(0, nbins - 1))
-            histStats <- merge(histStats, y, all.x=T, all.y=T)
+            y <- data.frame("bins" = seq(0, nbins - 1))
+            histStats <- merge(histStats, y, all.x = T, all.y = T)
             histStats[is.na(histStats$count), 2] <- 0
 
             # Compute centroids
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 17b01055711a7..3e459ab113b8b 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1902,9 +1902,9 @@ test_that("Histogram", {
   # Basic histogram test
   expect_equal(
     all(histogram(irisDF, "Petal_Width", 8) ==
-        data.frame(bins=seq(0, 7),
-                   counts=c(48, 2, 7, 21, 24, 19, 15, 14),
-                   centroids=seq(0,7) * 0.3 + 0.25)),
+        data.frame(bins = seq(0, 7),
+                   counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                   centroids = seq(0,7) * 0.3 + 0.25)),
         TRUE)
 
   # Missing nbins
@@ -1915,7 +1915,7 @@ test_that("Histogram", {
                "Specified colname does not belong to the given DataFrame.")
 
   # Invalid nbins
-  expect_error(histogram(irisDF, "Petal_Width", nbins=0),
+  expect_error(histogram(irisDF, "Petal_Width", nbins = 0),
                "The number of bins must be a positive integer number greater than 1.")
 
   # Test against R's hist
@@ -1923,7 +1923,7 @@ test_that("Histogram", {
                    histogram(irisDF, "Sepal_Width", 12)$counts), T)
 
   # Test when there are zero counts
-  df <- as.DataFrame(sqlContext, data.frame(x=c(1,2,3,4,100)))
+  df <- as.DataFrame(sqlContext, data.frame(x = c(1,2,3,4,100)))
   expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
 })
 unlink(parquetPath)

From c06344efa7508c72871cf1c48c3a2246fc4475d5 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 22 Mar 2016 15:57:49 -0700
Subject: [PATCH 07/22] Fixed style issues

---
 R/pkg/R/functions.R                       | 4 ++--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 0ff0489548f06..284634b148aa7 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2682,8 +2682,8 @@ setMethod("histogram",
             x <- eval(parse(text = paste0("df$", colname)))
 
             stats <- collect(describe(df[, colname]))
-            min <- as.numeric(stats[4,2])
-            max <- as.numeric(stats[5,2])
+            min <- as.numeric(stats[4, 2])
+            max <- as.numeric(stats[5, 2])
 
             # Normalize the data
             xnorm <- (x - min) / (max - min)
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 3e459ab113b8b..0407430b32d04 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1904,7 +1904,7 @@ test_that("Histogram", {
     all(histogram(irisDF, "Petal_Width", 8) ==
         data.frame(bins = seq(0, 7),
                    counts = c(48, 2, 7, 21, 24, 19, 15, 14),
-                   centroids = seq(0,7) * 0.3 + 0.25)),
+                   centroids = seq(0, 7) * 0.3 + 0.25)),
         TRUE)
 
   # Missing nbins
@@ -1923,7 +1923,7 @@ test_that("Histogram", {
                    histogram(irisDF, "Sepal_Width", 12)$counts), T)
 
   # Test when there are zero counts
-  df <- as.DataFrame(sqlContext, data.frame(x = c(1,2,3,4,100)))
+  df <- as.DataFrame(sqlContext, data.frame(x = c(1, 2, 3, 4, 100)))
   expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
 })
 unlink(parquetPath)

From 468adbf4ce1a4b367b8b8513d0b8623d23c79e60 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Wed, 23 Mar 2016 17:28:14 -0700
Subject: [PATCH 08/22] Added example to render the histogram with ggplot2, and
 added documentation tags

---
 R/pkg/R/functions.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 284634b148aa7..299f75c00a769 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2643,21 +2643,28 @@ setMethod("sort_array",
 #' 
 #' @name histogram
 #' @title Histogram
-#' @param nbins the number of bins (optional). The default is 10.
+#' @param nbins the number of bins (optional). Default value is 10.
 #' @param df the DataFrame containing the Column to build the histogram from.
 #' @param colname the name of the column to build the histogram from.
 #' @return a data.frame with the histogram statistics, i.e., counts and centroids.
-#' @examples \dontrun{
-#' 
+#' @rdname histogram
+#' @family agg_funcs
+#' @export
+#' @examples 
+#' \dontrun{
 #' # Create a DataFrame from the Iris dataset
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
 #' # Compute histogram statistics
 #' histData <- histogram(df, "colname"Sepal_Length", nbins = 12)
 #'
-#' # Once SparkR has computed the histogram statistics, it would be very easy to
-#' # render the histogram using R's visualization packages such as ggplot2.
-#'   
+#' # Once SparkR has computed the histogram statistics, the histogram can be
+#' # rendered using the ggplot2 library:
+#'
+#' require(ggplot2)
+#' plot <- ggplot(histStats, aes(x = centroids, y = counts))
+#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
+#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
 #' } 
 setMethod("histogram",
           signature(df = "DataFrame"),

From dbc9d75584b7ae3ab952c7a88b19b21ad00a5a82 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Thu, 24 Mar 2016 09:46:42 -0700
Subject: [PATCH 09/22] Round nbins down to the nearest integer

---
 R/pkg/R/functions.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 299f75c00a769..ab4ed74733b3e 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2674,6 +2674,9 @@ setMethod("histogram",
               stop("The number of bins must be a positive integer number greater than 1.")
             }
 
+            # Round nbins to the smallest integer
+            nbins <- floor(nbins)
+
             # Validate colname
             if (is.null(colname) | is.na(colname)) {
               stop("colname must be specified.")

From 19f995c8f72efb58b818f81a122529680c24f5ec Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Fri, 25 Mar 2016 16:48:33 -0700
Subject: [PATCH 10/22] Added support for Columns

---
 R/pkg/R/functions.R                       | 35 ++++++++++++++---------
 R/pkg/R/generics.R                        |  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 +++++++++++-
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index ab4ed74733b3e..d49765c8ce9b6 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2667,8 +2667,8 @@ setMethod("sort_array",
 #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
 #' } 
 setMethod("histogram",
-          signature(df = "DataFrame"),
-          function(df, colname, nbins = 10) {
+          signature(df = "DataFrame", col="characterOrColumn"),
+          function(df, col, nbins = 10) {
             # Validate nbins
             if (nbins < 2) {
               stop("The number of bins must be a positive integer number greater than 1.")
@@ -2677,19 +2677,28 @@ setMethod("histogram",
             # Round nbins to the smallest integer
             nbins <- floor(nbins)
 
-            # Validate colname
-            if (is.null(colname) | is.na(colname)) {
-              stop("colname must be specified.")
+            # Validate col
+            if (is.null(col)) {
+              stop("col must be specified.")
             }
-            if (!colname %in% names(df)) {
-              stop("Specified colname does not belong to the given DataFrame.")
-            }
-
-            # Filter NA values in the target column
-            df <- na.omit(df[, colname])
 
-            # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
-            x <- eval(parse(text = paste0("df$", colname)))
+            colname <- col
+            x <- if (class(col) == "character") {
+              if (!colname %in% names(df)) {
+                stop("Specified colname does not belong to the given DataFrame.")
+              }
+
+              # Filter NA values in the target column
+              df <- na.omit(df[, colname])
+
+              # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
+              eval(parse(text = paste0("df$", colname)))
+            } else if (class(col) == "Column") {
+              # Append the given column to the dataset
+              df$x <- col
+              colname <- "x"
+              col
+            }
 
             stats <- collect(describe(df[, colname]))
             min <- as.numeric(stats[4, 2])
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 6bc1042323a6b..323b4f3d307b4 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -101,7 +101,7 @@ setGeneric("glom", function(x) { standardGeneric("glom") })
 
 # @rdname histogram
 # @export
-setGeneric("histogram", function(df, colname, nbins=10) { standardGeneric("histogram") })
+setGeneric("histogram", function(df, col, nbins=10) { standardGeneric("histogram") })
 
 # @rdname keyBy
 # @export
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 0407430b32d04..9bd6d79f91127 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1899,7 +1899,7 @@ test_that("Method str()", {
 
 test_that("Histogram", {
 
-  # Basic histogram test
+  # Basic histogram test with colname
   expect_equal(
     all(histogram(irisDF, "Petal_Width", 8) ==
         data.frame(bins = seq(0, 7),
@@ -1907,6 +1907,22 @@ test_that("Histogram", {
                    centroids = seq(0, 7) * 0.3 + 0.25)),
         TRUE)
 
+  # Basic histogram test with Column
+  expect_equal(
+    all(histogram(irisDF, irisDF$Petal_Width, 8) ==
+          data.frame(bins = seq(0, 7),
+                     counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                     centroids = seq(0, 7) * 0.3 + 0.25)),
+    TRUE)
+
+  # Basic histogram test with derived column
+  expect_equal(
+    all(round(histogram(irisDF, irisDF$Petal_Width + 1, 8), 2) ==
+          data.frame(bins = seq(0, 7),
+                     counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                     centroids = seq(0, 7) * 0.3 + 1.25)),
+    TRUE)
+
   # Missing nbins
   expect_equal(length(histogram(irisDF, "Petal_Width")$counts), 10)
 

From 2800492e307253d7f6004944b2e4beb11f76c330 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Fri, 25 Mar 2016 16:53:43 -0700
Subject: [PATCH 11/22] Fixed style

---
 R/pkg/R/functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index d49765c8ce9b6..e81805262d919 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2667,7 +2667,7 @@ setMethod("sort_array",
 #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
 #' } 
 setMethod("histogram",
-          signature(df = "DataFrame", col="characterOrColumn"),
+          signature(df = "DataFrame", col = "characterOrColumn"),
           function(df, col, nbins = 10) {
             # Validate nbins
             if (nbins < 2) {

From adc34461a869d4b4c072952b999c896047e994d6 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 19 Apr 2016 11:09:58 -0700
Subject: [PATCH 12/22] Removed Roxygen auto generated comment

---
 R/pkg/DESCRIPTION | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 0cd0d75df0f70..bea3ed80e4166 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -36,4 +36,3 @@ Collate:
     'stats.R'
     'types.R'
     'utils.R'
-RoxygenNote: 5.0.1

From 046b7dad841bbf13d1d4b93bf001474f74b25865 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 19 Apr 2016 14:14:11 -0700
Subject: [PATCH 13/22] Added na.omit to the case when input is a Column

---
 R/pkg/R/functions.R | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e81805262d919..3ae29a7990ef0 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2688,18 +2688,26 @@ setMethod("histogram",
                 stop("Specified colname does not belong to the given DataFrame.")
               }
 
-              # Filter NA values in the target column
+              # Filter NA values in the target column and remove all other columns
               df <- na.omit(df[, colname])
 
               # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
-              eval(parse(text = paste0("df$", colname)))
+              getColumn(df, colname)
+              #eval(parse(text = paste0("df$", colname)))
             } else if (class(col) == "Column") {
-              # Append the given column to the dataset
+              # Append the given column to the dataset. This is to support Columns that
+              # don't belong to the DataFrame but are rather expressions
               df$x <- col
+
+              # Filter NA values in the target column. Cannot remove all other columns
+              # since given Column may be an expression on one or more existing columns
+              df <- na.omit(df)
+
               colname <- "x"
               col
             }
 
+            # At this point, df only has one column: the one to compute the histogram from
             stats <- collect(describe(df[, colname]))
             min <- as.numeric(stats[4, 2])
             max <- as.numeric(stats[5, 2])

From b03c335a6dc9818f54fc2633fb149f9f3ad0277d Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 19 Apr 2016 14:22:54 -0700
Subject: [PATCH 14/22] Removed commented-out eval(parse()) call

---
 R/pkg/R/functions.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 3ae29a7990ef0..f64f078f3de13 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2693,7 +2693,6 @@ setMethod("histogram",
 
               # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
               getColumn(df, colname)
-              #eval(parse(text = paste0("df$", colname)))
             } else if (class(col) == "Column") {
               # Append the given column to the dataset. This is to support Columns that
               # don't belong to the DataFrame but are rather expressions

From c2c4601b1a09e23e5b1be64fb027b92f9638da20 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Thu, 21 Apr 2016 13:55:13 -0700
Subject: [PATCH 15/22] Moved histogram function to DataFrame.R

---
 R/pkg/R/DataFrame.R | 104 ++++++++++++++++++++++++++++++++++++++++++++
 R/pkg/R/functions.R | 104 --------------------------------------------
 2 files changed, 104 insertions(+), 104 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 3b7b8250b94f7..f97202ff69e22 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2448,3 +2448,107 @@ setMethod("drop",
           function(x) {
             base::drop(x)
           })
+
+#' This function computes a histogram for a given SparkR Column.
+#' 
+#' @name histogram
+#' @title Histogram
+#' @param nbins the number of bins (optional). Default value is 10.
+#' @param df the DataFrame containing the Column to build the histogram from.
+#' @param colname the name of the column to build the histogram from.
+#' @return a data.frame with the histogram statistics, i.e., counts and centroids.
+#' @rdname histogram
+#' @family agg_funcs
+#' @export
+#' @examples 
+#' \dontrun{
+#' # Create a DataFrame from the Iris dataset
+#' irisDF <- createDataFrame(sqlContext, iris)
+#' 
+#' # Compute histogram statistics
+#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12)
+#'
+#' # Once SparkR has computed the histogram statistics, the histogram can be
+#' # rendered using the ggplot2 library:
+#'
+#' require(ggplot2)
+#' plot <- ggplot(histStats, aes(x = centroids, y = counts))
+#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
+#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
+#' } 
+setMethod("histogram",
+          signature(df = "DataFrame", col = "characterOrColumn"),
+          function(df, col, nbins = 10) {
+            # Validate nbins
+            if (nbins < 2) {
+              stop("The number of bins must be a positive integer number greater than 1.")
+            }
+
+            # Truncate a fractional nbins down to a whole number of bins
+            nbins <- floor(nbins)
+
+            # Validate col
+            if (is.null(col)) {
+              stop("col must be specified.")
+            }
+
+            colname <- col
+            x <- if (class(col) == "character") {
+              if (!colname %in% names(df)) {
+                stop("Specified colname does not belong to the given DataFrame.")
+              }
+
+              # Filter NA values in the target column and remove all other columns
+              df <- na.omit(df[, colname])
+              getColumn(df, colname)
+
+            } else if (class(col) == "Column") {
+
+              # Append the given column to the dataset. This is to support Columns that
+              # don't belong to the DataFrame but are rather expressions
+              df$x <- col
+
+              # Filter NA values in the target column. Cannot remove all other columns
+              # since given Column may be an expression on one or more existing columns
+              df <- na.omit(df)
+
+              colname <- "x"
+              col
+            }
+
+            # Summarize the target column only; rows 4 and 5 of the result hold min and max
+            stats <- collect(describe(df[, colname]))
+            min <- as.numeric(stats[4, 2])
+            max <- as.numeric(stats[5, 2])
+
+            # Normalize the data. NOTE(review): a constant column (max == min) divides by zero
+            xnorm <- (x - min) / (max - min)
+
+            # Truncate the data to 4 decimal places. This is to avoid rounding issues.
+            xnorm <- cast(xnorm * 10000, "integer") / 10000.0
+
+            # The normalized data spans [0, 1], so each normalized bin is 1 / nbins wide
+            normBinSize <- 1 / nbins
+            binsize <- (max - min) / nbins
+            approxBins <- xnorm / normBinSize
+
+            # Move values sitting exactly on a bin's upper bound into that bin (except min)
+            bins <- cast(approxBins -
+                           ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0),
+                         "integer")
+
+            df$bins <- bins
+            histStats <- collect(count(groupBy(df, "bins")))
+            names(histStats) <- c("bins", "counts")
+
+            # Add the bins that received no rows and set their counts to zero
+            y <- data.frame("bins" = seq(0, nbins - 1))
+            histStats <- merge(histStats, y, all.x = TRUE, all.y = TRUE)
+            histStats[is.na(histStats$counts), 2] <- 0
+
+            # Compute centroids (the midpoint of each bin on the original scale)
+            histStats$centroids <- histStats$bins * binsize + min + binsize / 2
+
+            # Return the statistics
+            return(histStats)
+          })
\ No newline at end of file
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index f64f078f3de13..e5521f3cffadf 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2638,107 +2638,3 @@ setMethod("sort_array",
             jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc)
             column(jc)
           })
-
-#' This function computes a histogram for a given SparkR Column.
-#' 
-#' @name histogram
-#' @title Histogram
-#' @param nbins the number of bins (optional). Default value is 10.
-#' @param df the DataFrame containing the Column to build the histogram from.
-#' @param colname the name of the column to build the histogram from.
-#' @return a data.frame with the histogram statistics, i.e., counts and centroids.
-#' @rdname histogram
-#' @family agg_funcs
-#' @export
-#' @examples 
-#' \dontrun{
-#' # Create a DataFrame from the Iris dataset
-#' irisDF <- createDataFrame(sqlContext, iris)
-#' 
-#' # Compute histogram statistics
-#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12)
-#'
-#' # Once SparkR has computed the histogram statistics, the histogram can be
-#' # rendered using the ggplot2 library:
-#'
-#' require(ggplot2)
-#' plot <- ggplot(histStats, aes(x = centroids, y = counts))
-#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
-#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
-#' } 
-setMethod("histogram",
-          signature(df = "DataFrame", col = "characterOrColumn"),
-          function(df, col, nbins = 10) {
-            # Validate nbins
-            if (nbins < 2) {
-              stop("The number of bins must be a positive integer number greater than 1.")
-            }
-
-            # Round nbins to the smallest integer
-            nbins <- floor(nbins)
-
-            # Validate col
-            if (is.null(col)) {
-              stop("col must be specified.")
-            }
-
-            colname <- col
-            x <- if (class(col) == "character") {
-              if (!colname %in% names(df)) {
-                stop("Specified colname does not belong to the given DataFrame.")
-              }
-
-              # Filter NA values in the target column and remove all other columns
-              df <- na.omit(df[, colname])
-
-              # TODO: This will be when improved SPARK-9325 or SPARK-13436 are fixed
-              getColumn(df, colname)
-            } else if (class(col) == "Column") {
-              # Append the given column to the dataset. This is to support Columns that
-              # don't belong to the DataFrame but are rather expressions
-              df$x <- col
-
-              # Filter NA values in the target column. Cannot remove all other columns
-              # since given Column may be an expression on one or more existing columns
-              df <- na.omit(df)
-
-              colname <- "x"
-              col
-            }
-
-            # At this point, df only has one column: the one to compute the histogram from
-            stats <- collect(describe(df[, colname]))
-            min <- as.numeric(stats[4, 2])
-            max <- as.numeric(stats[5, 2])
-
-            # Normalize the data
-            xnorm <- (x - min) / (max - min)
-
-            # Round the data to 4 significant digits. This is to avoid rounding issues.
-            xnorm <- cast(xnorm * 10000, "integer") / 10000.0
-
-            # Since min = 0, max = 1 (data is already normalized)
-            normBinSize <- 1 / nbins
-            binsize <- (max - min) / nbins
-            approxBins <- xnorm / normBinSize
-
-            # Adjust values that are equal to the upper bound of each bin
-            bins <- cast(approxBins -
-                         ifelse(approxBins == cast(approxBins, "integer") & x != min, 1, 0),
-                         "integer")
-
-            df$bins <- bins
-            histStats <- collect(count(groupBy(df, "bins")))
-            names(histStats) <- c("bins", "counts")
-
-            # Fill bins with zero counts
-            y <- data.frame("bins" = seq(0, nbins - 1))
-            histStats <- merge(histStats, y, all.x = T, all.y = T)
-            histStats[is.na(histStats$count), 2] <- 0
-
-            # Compute centroids
-            histStats$centroids <- histStats$bins * binsize + min + binsize / 2
-
-            # Return the statistics
-            return(histStats)
-         })

From fc4c536ca55fe4beefc27139dad03093cff7194e Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.attlocal.net>
Date: Fri, 22 Apr 2016 10:37:52 -0700
Subject: [PATCH 16/22] Minor docs fix

---
 R/pkg/R/DataFrame.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 5646d3a93f84d..780009453f001 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2474,7 +2474,7 @@ setMethod("drop",
 #' @param colname the name of the column to build the histogram from.
 #' @return a data.frame with the histogram statistics, i.e., counts and centroids.
 #' @rdname histogram
-#' @family agg_funcs
+#' @family DataFrame functions
 #' @export
 #' @examples 
 #' \dontrun{

From 976e412e7cdcbee95164f05eaf088e5ec7b08160 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Mon, 25 Apr 2016 14:47:04 -0700
Subject: [PATCH 17/22] Removed leftover merge conflict marker from DataFrame.R

---
 R/pkg/R/DataFrame.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index afadf8406eaaa..38ca550f3ee8b 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2469,7 +2469,6 @@ setMethod("drop",
             base::drop(x)
           })
 
-<<<<<<< HEAD
 #' This function computes a histogram for a given SparkR Column.
 #' 
 #' @name histogram

From 96714fdede2dd42f348357ded908305589aecc91 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 26 Apr 2016 12:28:29 -0700
Subject: [PATCH 18/22] Added dynamic colname generation to avoid colliding
 with existing columns

---
 R/pkg/R/DataFrame.R | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 38ca550f3ee8b..88e5c9b096241 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2478,7 +2478,7 @@ setMethod("drop",
 #' @param colname the name of the column to build the histogram from.
 #' @return a data.frame with the histogram statistics, i.e., counts and centroids.
 #' @rdname histogram
-#' @family DataFrame functions
+#' @family SparkDataFrame functions
 #' @export
 #' @examples 
 #' \dontrun{
@@ -2486,7 +2486,7 @@ setMethod("drop",
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
 #' # Compute histogram statistics
-#' histData <- histogram(df, "colname"Sepal_Length", nbins = 12)
+#' histData <- histogram(df, df$Sepal_Length, nbins = 12)
 #'
 #' # Once SparkR has computed the histogram statistics, the histogram can be
 #' # rendered using the ggplot2 library:
@@ -2497,7 +2497,7 @@ setMethod("drop",
 #' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
 #' } 
 setMethod("histogram",
-          signature(df = "DataFrame", col = "characterOrColumn"),
+          signature(df = "SparkDataFrame", col = "characterOrColumn"),
           function(df, col, nbins = 10) {
             # Validate nbins
             if (nbins < 2) {
@@ -2524,15 +2524,30 @@ setMethod("histogram",
 
             } else if (class(col) == "Column") {
 
+              # The given column needs to be appended to the SparkDataFrame so that we can
+              # use method describe() to compute statistics in one single pass. The new
+              # column must have a name that doesn't exist in the dataset.
+              # To do so, we generate a random column name with more characters than the
+              # longest colname in the dataset, but no more than 100 (think of a UUID).
+              # This column name will never be visible to the user, so the name is irrelevant.
+              # Limiting the colname length to 100 makes debugging easier and it does
+              # introduce a negligible probability of collision: assuming the user has 1 million
+              # columns AND all of them have names 100 characters long (which is very unlikely),
+              # AND they run 1 billion histograms, the probability of collision will roughly be
+              # 1 in 4.4 x 10 ^ 96
+              colname <- paste(base:::sample(c(letters, LETTERS),
+                                             size = min(max(nchar(colnames(df))) + 1, 100),
+                                             replace=TRUE),
+                               collapse="")
+
               # Append the given column to the dataset. This is to support Columns that
               # don't belong to the DataFrame but are rather expressions
-              df$x <- col
+              df <- withColumn(df, colname, col)
 
               # Filter NA values in the target column. Cannot remove all other columns
               # since given Column may be an expression on one or more existing columns
               df <- na.omit(df)
 
-              colname <- "x"
               col
             }
 

From cd7ba4c3af26beba4ac4c0f09ea6f3560069d5a4 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 26 Apr 2016 14:14:41 -0700
Subject: [PATCH 19/22] Fixed ggplot example

---
 R/pkg/R/DataFrame.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 88e5c9b096241..6946caef7f95e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2482,19 +2482,20 @@ setMethod("drop",
 #' @export
 #' @examples 
 #' \dontrun{
+#' 
 #' # Create a DataFrame from the Iris dataset
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
 #' # Compute histogram statistics
-#' histData <- histogram(df, df$Sepal_Length, nbins = 12)
+#' histStats <- histogram(irisDF, irisDF$Sepal_Length, nbins = 12)
 #'
 #' # Once SparkR has computed the histogram statistics, the histogram can be
 #' # rendered using the ggplot2 library:
 #'
 #' require(ggplot2)
-#' plot <- ggplot(histStats, aes(x = centroids, y = counts))
-#' plot <- plot + geom_histogram(data = histStats, stat = "identity", binwidth = 100)
-#' plot <- plot + xlab("Sepal_Length") + ylab("Frequency")   
+#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) +
+#'         geom_bar(stat = "identity") +
+#'         xlab("Sepal_Length") + ylab("Frequency")   
 #' } 
 setMethod("histogram",
           signature(df = "SparkDataFrame", col = "characterOrColumn"),

From e9dbc5b27c258777a539723e0ad4676db928736b Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 26 Apr 2016 14:24:28 -0700
Subject: [PATCH 20/22] Fixed style issues

---
 R/pkg/R/DataFrame.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 6946caef7f95e..577089c7c3c4f 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2538,8 +2538,8 @@ setMethod("histogram",
               # 1 in 4.4 x 10 ^ 96
               colname <- paste(base:::sample(c(letters, LETTERS),
                                              size = min(max(nchar(colnames(df))) + 1, 100),
-                                             replace=TRUE),
-                               collapse="")
+                                             replace = TRUE),
+                               collapse = "")
 
               # Append the given column to the dataset. This is to support Columns that
               # don't belong to the DataFrame but are rather expressions

From fc2f6a31166ac895b5c2ce05074f5c7edf372706 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 26 Apr 2016 14:57:57 -0700
Subject: [PATCH 21/22] Changed DataFrame to SparkDataFrame

---
 R/pkg/R/DataFrame.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 577089c7c3c4f..36aedfae86b33 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2474,7 +2474,7 @@ setMethod("drop",
 #' @name histogram
 #' @title Histogram
 #' @param nbins the number of bins (optional). Default value is 10.
-#' @param df the DataFrame containing the Column to build the histogram from.
+#' @param df the SparkDataFrame containing the Column to build the histogram from.
 #' @param colname the name of the column to build the histogram from.
 #' @return a data.frame with the histogram statistics, i.e., counts and centroids.
 #' @rdname histogram
@@ -2483,7 +2483,7 @@ setMethod("drop",
 #' @examples 
 #' \dontrun{
 #' 
-#' # Create a DataFrame from the Iris dataset
+#' # Create a SparkDataFrame from the Iris dataset
 #' irisDF <- createDataFrame(sqlContext, iris)
 #' 
 #' # Compute histogram statistics
@@ -2516,7 +2516,7 @@ setMethod("histogram",
             colname <- col
             x <- if (class(col) == "character") {
               if (!colname %in% names(df)) {
-                stop("Specified colname does not belong to the given DataFrame.")
+                stop("Specified colname does not belong to the given SparkDataFrame.")
               }
 
               # Filter NA values in the target column and remove all other columns
@@ -2542,7 +2542,7 @@ setMethod("histogram",
                                collapse = "")
 
               # Append the given column to the dataset. This is to support Columns that
-              # don't belong to the DataFrame but are rather expressions
+              # don't belong to the SparkDataFrame but are rather expressions
               df <- withColumn(df, colname, col)
 
               # Filter NA values in the target column. Cannot remove all other columns

From 838c9155839bbb7fd4d5f855a9d88ae68fef2ffb Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas" <odlaraye@oscars-mbp.usca.ibm.com>
Date: Tue, 26 Apr 2016 15:18:06 -0700
Subject: [PATCH 22/22] Changed error message on histogram tests

---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 9e073d4b9ffc7..336068035eaf8 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -2003,7 +2003,7 @@ test_that("Histogram", {
 
   # Wrong colname
   expect_error(histogram(irisDF, "xxx"),
-               "Specified colname does not belong to the given DataFrame.")
+               "Specified colname does not belong to the given SparkDataFrame.")
 
   # Invalid nbins
   expect_error(histogram(irisDF, "Petal_Width", nbins = 0),