From 99109c4b592c57d1ba00a002b3d4b71ece10f954 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Sun, 23 Aug 2015 03:06:33 -0700 Subject: [PATCH 1/4] R: Add support for subsetting + tests --- R/pkg/R/DataFrame.R | 10 ++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 895603235011e..13558bc270475 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -945,6 +945,13 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) +#' @rdname select +setMethod("[", signature(x = "DataFrame"), + function(x, i, j, ...) { + filtered <- filter(x, i) + filtered[, j] + }) + #' Select #' #' Selects a set of columns with names or Column expressions. @@ -963,8 +970,11 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' # Columns can also be selected using `[[` and `[` #' df[[2]] == df[["age"]] #' df[,2] == df[,"age"] +#' df[,c("name", "age")] #' # Similar to R data frames columns can also be selected using `$` #' df$age +#' # It can also be subset on rows and Columns +#' df[df$name == "Smith", 2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 556b8c5447054..19a776b96a2c4 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -587,6 +587,22 @@ test_that("select with column", { expect_equal(collect(select(df3, "x"))[[1, 1]], "x") }) +test_that("subsetting", { + df <- jsonFile(sqlContext, jsonPath) + filtered <- df[df$age > 20,] + expect_equal(count(filtered), 1) + expect_equal(columns(filtered), c("name", "age")) + expect_equal(collect(filtered)$name, "Andy") + df2 <- df[df$age == 19,1] + expect_is(df2, "DataFrame") + expect_equal(count(df2), 1) + expect_equal(columns(df2), c("name")) + expect_equal(collect(df2)$name, "Justin") + df3 <- df[df$age > 20, 2] + expect_equal(count(df3), 1) + expect_equal(columns(df3), c("age")) +}} + test_that("selectExpr() on a DataFrame", { df <- jsonFile(sqlContext, jsonPath) selected <- selectExpr(df, "age * 2") From 42e881a4e4c7d3e669400b44ec24c4af1e10f6da Mon Sep 17 00:00:00 2001 From: felixcheung Date: Mon, 24 Aug 2015 00:46:26 -0700 Subject: [PATCH 2/4] add support for d[d$something>0,], more tests --- R/pkg/R/DataFrame.R | 16 ++++++++++++---- R/pkg/inst/tests/test_sparkSQL.R | 13 ++++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 13558bc270475..7f535f3490290 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -922,8 +922,10 @@ setMethod("$<-", signature(x = "DataFrame"), x }) +setClassUnion("numericOrcharacter", c("numeric", "character")) + #' @rdname select -setMethod("[[", signature(x = "DataFrame"), +setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { if (is.numeric(i)) { cols <- columns(x) @@ -946,10 +948,16 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), }) #' @rdname select -setMethod("[", signature(x = "DataFrame"), +setMethod("[", signature(x = "DataFrame", i = "Column"), function(x, i, j, ...) { + # It could handle i as "character" but it seems confusing and not required + # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html filtered <- filter(x, i) - filtered[, j] + if (!missing(j)) { + filtered[, j] + } else { + filtered + } }) #' Select @@ -974,7 +982,7 @@ setMethod("[", signature(x = "DataFrame"), #' # Similar to R data frames columns can also be selected using `$` #' df$age #' # It can also be subset on rows and Columns -#' df[df$name == "Smith", 2] +#' df[df$name == "Smith", c(1,2)] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 19a776b96a2c4..003b9778fcadc 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -588,12 +588,13 @@ test_that("select with column", { }) test_that("subsetting", { - df <- jsonFile(sqlContext, jsonPath) + # jsonFile returns columns in random order + df <- select(jsonFile(sqlContext, jsonPath), "name", "age") filtered <- df[df$age > 20,] expect_equal(count(filtered), 1) expect_equal(columns(filtered), c("name", "age")) expect_equal(collect(filtered)$name, "Andy") - df2 <- df[df$age == 19,1] + df2 <- df[df$age == 19, 1] expect_is(df2, "DataFrame") expect_equal(count(df2), 1) expect_equal(columns(df2), c("name")) @@ -601,7 +602,13 @@ test_that("subsetting", { df3 <- df[df$age > 20, 2] expect_equal(count(df3), 1) expect_equal(columns(df3), c("age")) -}} + df4 <- df[df$age %in% c(19, 30), 1:2] + expect_equal(count(df4), 2) + expect_equal(columns(df4), c("name", "age")) + df5 <- df[df$age %in% c(19), c(1,2)] + expect_equal(count(df5), 1) + expect_equal(columns(df5), c("name", "age")) +}) test_that("selectExpr() on a DataFrame", { df <- jsonFile(sqlContext, jsonPath) From 16e0ba375ac12788478002ce96ca74206c2d437a Mon Sep 17 00:00:00 2001 From: felixcheung Date: Mon, 24 Aug 2015 00:47:15 -0700 Subject: [PATCH 3/4] update example --- R/pkg/R/DataFrame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 7f535f3490290..c91d05ef0e604 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -983,6 +983,7 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' df$age #' # It can also be subset on rows and Columns #' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) { From 3578ba29ef1bef3fead1232278fb23dd28d62c0f Mon Sep 17 00:00:00 2001 From: felixcheung Date: Mon, 24 Aug 2015 21:39:36 -0700 Subject: [PATCH 4/4] updating test formatting from feedback --- R/pkg/inst/tests/test_sparkSQL.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 003b9778fcadc..ee48a3dc0cc05 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -594,17 +594,21 @@ test_that("subsetting", { expect_equal(count(filtered), 1) expect_equal(columns(filtered), c("name", "age")) expect_equal(collect(filtered)$name, "Andy") + df2 <- df[df$age == 19, 1] expect_is(df2, "DataFrame") expect_equal(count(df2), 1) expect_equal(columns(df2), c("name")) expect_equal(collect(df2)$name, "Justin") + df3 <- df[df$age > 20, 2] expect_equal(count(df3), 1) expect_equal(columns(df3), c("age")) + df4 <- df[df$age %in% c(19, 30), 1:2] expect_equal(count(df4), 2) expect_equal(columns(df4), c("name", "age")) + df5 <- df[df$age %in% c(19), c(1,2)] expect_equal(count(df5), 1) expect_equal(columns(df5), c("name", "age"))