
Commit: merge conflicts
ifilonenko committed Sep 7, 2018
2 parents 7dc26ce + 9241e1e commit fe8cc5a
Showing 238 changed files with 5,087 additions and 2,000 deletions.
2 changes: 1 addition & 1 deletion R/README.md
@@ -17,7 +17,7 @@ export R_HOME=/home/username/R

#### Build Spark

Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run

```bash
build/mvn -DskipTests -Psparkr package
2 changes: 1 addition & 1 deletion R/WINDOWS.md
@@ -14,7 +14,7 @@ directory in Maven in `PATH`.

4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).

5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run

```bash
mvn.cmd -DskipTests -Psparkr package
4 changes: 4 additions & 0 deletions R/pkg/NAMESPACE
@@ -204,6 +204,8 @@ exportMethods("%<=>%",
"approxQuantile",
"array_contains",
"array_distinct",
"array_except",
"array_intersect",
"array_join",
"array_max",
"array_min",
@@ -212,6 +214,7 @@ exportMethods("%<=>%",
"array_repeat",
"array_sort",
"arrays_overlap",
"array_union",
"arrays_zip",
"asc",
"ascii",
@@ -355,6 +358,7 @@ exportMethods("%<=>%",
"shiftLeft",
"shiftRight",
"shiftRightUnsigned",
"shuffle",
"sd",
"sign",
"signum",
63 changes: 60 additions & 3 deletions R/pkg/R/functions.R
@@ -208,7 +208,7 @@ NULL
#' # Dataframe used throughout this doc
#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
#' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1)))
#' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
#' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
#' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1), array_remove(tmp$v1, 21)))
@@ -223,6 +223,8 @@ NULL
#' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
#' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
#' head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5)))
#' head(select(tmp4, array_union(tmp4$v4, tmp4$v5)))
#' head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5), map_from_arrays(tmp4$v4, tmp4$v5)))
#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
#' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
@@ -1697,8 +1699,8 @@ setMethod("to_date",
})

#' @details
#' \code{to_json}: Converts a column containing a \code{structType}, array of \code{structType},
#' a \code{mapType} or array of \code{mapType} into a Column of JSON string.
#' \code{to_json}: Converts a column containing a \code{structType}, a \code{mapType}
#' or an \code{arrayType} into a Column of JSON string.
#' Resolving the Column can fail if an unsupported type is encountered.
#'
#' @rdname column_collection_functions
@@ -3024,6 +3026,34 @@ setMethod("array_distinct",
column(jc)
})

#' @details
#' \code{array_except}: Returns an array of the elements in the first array but not in the second
#' array, without duplicates. The order of elements in the result is not determined.
#'
#' @rdname column_collection_functions
#' @aliases array_except array_except,Column-method
#' @note array_except since 2.4.0
setMethod("array_except",
signature(x = "Column", y = "Column"),
function(x, y) {
jc <- callJStatic("org.apache.spark.sql.functions", "array_except", x@jc, y@jc)
column(jc)
})
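
As a reference point, here is a minimal SparkR sketch of `array_except`, mirroring the new test case added to test_sparkSQL.R in this commit; it assumes an active SparkR session created with `sparkR.session()`:

```r
# Hedged sketch: assumes sparkR.session() has already been called.
df <- createDataFrame(list(list(list(1L, 2L, 3L), list(3L, 1L))))
# Elements of the first array column that are not in the second, without duplicates.
collect(select(df, array_except(df[[1]], df[[2]])))[[1]]
# -> list(list(2L)) for this single-row frame
```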

#' @details
#' \code{array_intersect}: Returns an array of the elements in the intersection of the given two
#' arrays, without duplicates.
#'
#' @rdname column_collection_functions
#' @aliases array_intersect array_intersect,Column-method
#' @note array_intersect since 2.4.0
setMethod("array_intersect",
signature(x = "Column", y = "Column"),
function(x, y) {
jc <- callJStatic("org.apache.spark.sql.functions", "array_intersect", x@jc, y@jc)
column(jc)
})

#' @details
#' \code{array_join}: Concatenates the elements of column using the delimiter.
#' Null values are replaced with nullReplacement if set, otherwise they are ignored.
@@ -3149,6 +3179,20 @@ setMethod("arrays_overlap",
column(jc)
})

#' @details
#' \code{array_union}: Returns an array of the elements in the union of the given two arrays,
#' without duplicates.
#'
#' @rdname column_collection_functions
#' @aliases array_union array_union,Column-method
#' @note array_union since 2.4.0
setMethod("array_union",
signature(x = "Column", y = "Column"),
function(x, y) {
jc <- callJStatic("org.apache.spark.sql.functions", "array_union", x@jc, y@jc)
column(jc)
})
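
A similarly hedged sketch covering the other two set operations added in this commit, `array_intersect` (defined above) and `array_union`; the values follow the second row of the new test data, and the same session assumption applies:

```r
df <- createDataFrame(list(list(list(1L, 2L), list(3L, 4L))))
collect(select(df, array_intersect(df[[1]], df[[2]])))[[1]]  # list(list()) - no overlap
collect(select(df, array_union(df[[1]], df[[2]])))[[1]]      # list(list(1L, 2L, 3L, 4L))
```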

#' @details
#' \code{arrays_zip}: Returns a merged array of structs in which the N-th struct contains all N-th
#' values of input arrays.
@@ -3167,6 +3211,19 @@ setMethod("arrays_zip",
column(jc)
})

#' @details
#' \code{shuffle}: Returns a random permutation of the given array.
#'
#' @rdname column_collection_functions
#' @aliases shuffle shuffle,Column-method
#' @note shuffle since 2.4.0
setMethod("shuffle",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "shuffle", x@jc)
column(jc)
})
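
An illustrative sketch of `shuffle` (assuming an active SparkR session); because the permutation is random, only the set of elements is stable across runs:

```r
df <- createDataFrame(list(list(list(1L, 20L, 3L, 5L))))
result <- collect(select(df, shuffle(df[[1]])))[[1]]
setequal(result[[1]], c(1L, 20L, 3L, 5L))  # TRUE; element order differs between runs
```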

#' @details
#' \code{flatten}: Creates a single array from an array of arrays.
#' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
16 changes: 16 additions & 0 deletions R/pkg/R/generics.R
@@ -767,6 +767,14 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain
#' @name NULL
setGeneric("array_distinct", function(x) { standardGeneric("array_distinct") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_except", function(x, y) { standardGeneric("array_except") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_intersect", function(x, y) { standardGeneric("array_intersect") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
@@ -799,6 +807,10 @@ setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
#' @name NULL
setGeneric("arrays_overlap", function(x, y) { standardGeneric("arrays_overlap") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_union", function(x, y) { standardGeneric("array_union") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("arrays_zip", function(x, ...) { standardGeneric("arrays_zip") })
@@ -1220,6 +1232,10 @@ setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") })
#' @name NULL
setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("shuffle", function(x) { standardGeneric("shuffle") })

#' @rdname column_math_functions
#' @name NULL
setGeneric("signum", function(x) { standardGeneric("signum") })
5 changes: 3 additions & 2 deletions R/pkg/R/mllib_fpm.R
@@ -116,10 +116,11 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"),
# Get association rules.

#' @return A \code{SparkDataFrame} with association rules.
#' The \code{SparkDataFrame} contains three columns:
#' The \code{SparkDataFrame} contains four columns:
#' \code{antecedent} (an array of the same type as the input column),
#' \code{consequent} (an array of the same type as the input column),
#' and \code{condfidence} (confidence).
#' \code{condfidence} (confidence for the rule)
#' and \code{lift} (lift for the rule)
#' @rdname spark.fpGrowth
#' @aliases associationRules,FPGrowthModel-method
#' @note spark.associationRules(FPGrowthModel) since 2.2.0
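
For context, a hedged end-to-end sketch of how the four-column rules frame can be produced; `selectExpr` and the SQL `split` function are assumed to be available in the SparkR API, and the exact rule values depend on the input data:

```r
itemsets <- data.frame(rawItems = c("1,2,5", "1,2,3,5", "1,2"))
df <- selectExpr(createDataFrame(itemsets), "split(rawItems, ',') AS items")
model <- spark.fpGrowth(df, itemsCol = "items", minSupport = 0.5, minConfidence = 0.6)
# Returns a SparkDataFrame with columns: antecedent, consequent, confidence, lift
head(spark.associationRules(model))
```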
3 changes: 2 additions & 1 deletion R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -44,7 +44,8 @@ test_that("spark.fpGrowth", {
expected_association_rules <- data.frame(
antecedent = I(list(list("2"), list("3"))),
consequent = I(list(list("1"), list("1"))),
confidence = c(1, 1)
confidence = c(1, 1),
lift = c(1, 1)
)

expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
35 changes: 32 additions & 3 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1598,6 +1598,25 @@ test_that("column functions", {
result <- collect(select(df, element_at(df$map, "y")))[[1]]
expect_equal(result, 2)

# Test array_except(), array_intersect() and array_union()
df <- createDataFrame(list(list(list(1L, 2L, 3L), list(3L, 1L)),
list(list(1L, 2L), list(3L, 4L)),
list(list(1L, 2L, 3L), list(3L, 4L))))
result1 <- collect(select(df, array_except(df[[1]], df[[2]])))[[1]]
expect_equal(result1, list(list(2L), list(1L, 2L), list(1L, 2L)))

result2 <- collect(select(df, array_intersect(df[[1]], df[[2]])))[[1]]
expect_equal(result2, list(list(1L, 3L), list(), list(3L)))

result3 <- collect(select(df, array_union(df[[1]], df[[2]])))[[1]]
expect_equal(result3, list(list(1L, 2L, 3L), list(1L, 2L, 3L, 4L), list(1L, 2L, 3L, 4L)))

# Test shuffle()
df <- createDataFrame(list(list(list(1L, 20L, 3L, 5L)), list(list(4L, 5L, 6L, 7L))))
result <- collect(select(df, shuffle(df[[1]])))[[1]]
expect_true(setequal(result[[1]], c(1L, 20L, 3L, 5L)))
expect_true(setequal(result[[2]], c(4L, 5L, 6L, 7L)))

# Test that stats::lag is working
expect_equal(length(lag(ldeaths, 12)), 72)

@@ -1667,6 +1686,15 @@ test_that("column functions", {
expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 })))
}

# Test to_json() supports arrays of primitive types and arrays
df <- sql("SELECT array(19, 42, 70) as age")
j <- collect(select(df, alias(to_json(df$age), "json")))
expect_equal(j[order(j$json), ][1], "[19,42,70]")

df <- sql("SELECT array(array(1, 2), array(3, 4)) as matrix")
j <- collect(select(df, alias(to_json(df$matrix), "json")))
expect_equal(j[order(j$json), ][1], "[[1,2],[3,4]]")

# passing option
df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
schema2 <- structType(structField("date", "date"))
@@ -1851,9 +1879,9 @@ test_that("date functions on a DataFrame", {
expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC")))
c(as.POSIXct("2012-12-13 21:34:00 UTC"), as.POSIXct("2014-12-15 10:24:34 UTC")))
expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC")))
c(as.POSIXct("2012-12-13 03:34:00 UTC"), as.POSIXct("2014-12-14 16:24:34 UTC")))
expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
@@ -3633,7 +3661,8 @@ test_that("catalog APIs, currentDatabase, setCurrentDatabase, listDatabases", {
expect_equal(currentDatabase(), "default")
expect_error(setCurrentDatabase("default"), NA)
expect_error(setCurrentDatabase("zxwtyswklpf"),
"Error in setCurrentDatabase : analysis error - Database 'zxwtyswklpf' does not exist")
paste0("Error in setCurrentDatabase : analysis error - Database ",
"'zxwtyswklpf' does not exist"))
dbs <- collect(listDatabases())
expect_equal(names(dbs), c("name", "description", "locationUri"))
expect_equal(which(dbs[, 1] == "default"), 1)
2 changes: 1 addition & 1 deletion README.md
@@ -90,7 +90,7 @@ storage systems. Because the protocols have changed in different versions of
Hadoop, you must build Spark against the same version that your cluster runs.

Please refer to the build documentation at
["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version)
["Specifying the Hadoop Version and Enabling YARN"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version-and-enabling-yarn)
for detailed guidance on building for a particular distribution of Hadoop, including
building for particular Hive and Hive Thriftserver distributions.

2 changes: 1 addition & 1 deletion assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command

If you need to build an assembly for a different version of Hadoop the
hadoop-version system property needs to be set as in this example:
-Dhadoop.version=2.7.7
-Dhadoop.version=2.7.3
29 changes: 19 additions & 10 deletions build/mvn
@@ -67,6 +67,9 @@ install_app() {
fi
}

# See simple version normalization: http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers
function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; }

# Determine the Maven version from the root pom.xml file and
# install maven under the build/ folder if needed.
install_mvn() {
@@ -75,8 +78,6 @@ install_mvn() {
if [ "$MVN_BIN" ]; then
local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"
fi
# See simple version normalization: http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers
function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; }
if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then
local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='}

@@ -91,15 +92,23 @@

# Install zinc under the build/ folder
install_zinc() {
local zinc_path="zinc-0.3.15/bin/zinc"
[ ! -f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1
local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com}
local ZINC_VERSION=0.3.15
ZINC_BIN="$(command -v zinc)"
if [ "$ZINC_BIN" ]; then
local ZINC_DETECTED_VERSION="$(zinc -version | head -n1 | awk '{print $5}')"
fi

install_app \
"${TYPESAFE_MIRROR}/zinc/0.3.15" \
"zinc-0.3.15.tgz" \
"${zinc_path}"
ZINC_BIN="${_DIR}/${zinc_path}"
if [ $(version $ZINC_DETECTED_VERSION) -lt $(version $ZINC_VERSION) ]; then
local zinc_path="zinc-${ZINC_VERSION}/bin/zinc"
[ ! -f "${_DIR}/${zinc_path}" ] && ZINC_INSTALL_FLAG=1
local TYPESAFE_MIRROR=${TYPESAFE_MIRROR:-https://downloads.lightbend.com}

install_app \
"${TYPESAFE_MIRROR}/zinc/${ZINC_VERSION}" \
"zinc-${ZINC_VERSION}.tgz" \
"${zinc_path}"
ZINC_BIN="${_DIR}/${zinc_path}"
fi
}

# Determine the Scala version from the root pom.xml file, set the Scala URL,
