
Merge remote-tracking branch 'upstream/master' into setOpsTest
gatorsmile committed Nov 26, 2018
2 parents 14a927d + 6339c8c commit 5cfe08d
Showing 42 changed files with 270 additions and 95 deletions.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -169,6 +169,7 @@ exportMethods("arrange",
"toJSON",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
14 changes: 14 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2732,6 +2732,20 @@ setMethod("union",
dataFrame(unioned)
})

#' Return a new SparkDataFrame containing the union of rows
#'
#' This is an alias for `union`.
#'
#' @rdname union
#' @name unionAll
#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method
#' @note unionAll since 1.4.0
setMethod("unionAll",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
union(x, y)
})
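
For illustration, a minimal SparkR sketch of the re-added alias (not part of the diff; it assumes an active sparkR.session() and two SparkDataFrames with the same schema):

# unionAll(x, y) simply delegates to union(x, y); duplicate rows are kept.
df1 <- createDataFrame(data.frame(id = 1:3, name = c("a", "b", "c")))
df2 <- createDataFrame(data.frame(id = 2:4, name = c("b", "c", "d")))
combined <- unionAll(df1, df2)
count(combined)   # 6 -- use distinct(combined) to drop duplicates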

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
3 changes: 3 additions & 0 deletions R/pkg/R/generics.R
@@ -631,6 +631,9 @@ setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
#' @rdname union
setGeneric("union", function(x, y) { standardGeneric("union") })

#' @rdname union
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

4 changes: 2 additions & 2 deletions R/pkg/R/stats.R
@@ -109,7 +109,7 @@ setMethod("corr",
#'
#' Finding frequent items for columns, possibly with false positives.
#' Using the frequent element count algorithm described in
-#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
+#' \url{https://doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#'
#' @param x A SparkDataFrame.
#' @param cols A vector of column names to search frequent items in.
@@ -143,7 +143,7 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
#' *exact* rank of x is close to (p * N). More precisely,
#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
-#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
+#' optimizations). The algorithm was first present in [[https://doi.org/10.1145/375663.375670
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
#' Note that NA values will be ignored in numerical columns before calculation. For
#' columns only containing NA values, an empty list is returned.
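
As a usage illustration (a sketch, not part of the diff; assumes an active SparkR session and a hypothetical SparkDataFrame df):

# freqItems: frequent values per column, possibly with false positives.
df <- createDataFrame(data.frame(age = c(21, 25, 25, 40, 25), income = c(10, 20, 20, 50, 20)))
head(freqItems(df, c("age", "income"), support = 0.3))
# approxQuantile: median of "age" within 1% relative error; NA values are ignored.
approxQuantile(df, "age", probabilities = c(0.5), relativeError = 0.01)
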
1 change: 1 addition & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2458,6 +2458,7 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
expect_equal(count(unioned), 6)
expect_equal(first(unioned)$name, "Michael")
expect_equal(count(arrange(suppressWarnings(union(df, df2)), df$age)), 6)
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)

df1 <- select(df2, "age", "name")
unioned1 <- arrange(unionByName(df1, df), df1$age)
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -952,7 +952,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
@@ -969,7 +969,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
@@ -985,7 +985,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -685,7 +685,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
8 changes: 4 additions & 4 deletions core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -394,7 +394,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp` (greater
* than `p`) would trigger sparse representation of registers, which may reduce the
@@ -436,7 +436,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
@@ -456,7 +456,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
@@ -473,7 +473,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1258,7 +1258,7 @@ abstract class RDD[T: ClassTag](
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp` (greater
* than `p`) would trigger sparse representation of registers, which may reduce the memory
Expand Down Expand Up @@ -1290,7 +1290,7 @@ abstract class RDD[T: ClassTag](
*
* The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
* Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
* <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
* <a href="https://doi.org/10.1145/2452376.2452456">here</a>.
*
* @param relativeSD Relative accuracy. Smaller values create counters that require more space.
* It must be greater than 0.000017.
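
SparkR does not expose the RDD-level countApproxDistinct, but a related HyperLogLog-based estimate is available as a DataFrame aggregate; a hedged R sketch (rsd plays the role of relativeSD above):

df <- createDataFrame(data.frame(user = c("a", "b", "a", "c", "b", "a")))
# Estimate the number of distinct users with ~5% relative standard deviation.
head(agg(df, distinct_users = approxCountDistinct(df$user, rsd = 0.05)))
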
4 changes: 2 additions & 2 deletions docs/ml-classification-regression.md
@@ -941,9 +941,9 @@ Essentially isotonic regression is a
best fitting the original data points.

We implement a
-[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111)
+[pool adjacent violators algorithm](https://doi.org/10.1198/TECH.2010.10111)
which uses an approach to
-[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10).
+[parallelizing isotonic regression](https://doi.org/10.1007/978-3-642-99789-1_10).
The training input is a DataFrame which contains three columns:
label, features, and weight. Additionally, the IsotonicRegression algorithm has one
optional parameter, $isotonic$, defaulting to true.
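
A minimal SparkR sketch (hypothetical toy data; spark.isoreg fits the model, predict applies it):

df <- createDataFrame(data.frame(label = c(1.0, 2.0, 3.5, 3.0, 6.0),
                                 feature = c(1.0, 2.0, 3.0, 4.0, 5.0)))
model <- spark.isoreg(df, label ~ feature, isotonic = TRUE)
head(predict(model, df))   # fitted values are non-decreasing in feature
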
4 changes: 2 additions & 2 deletions docs/ml-collaborative-filtering.md
@@ -41,7 +41,7 @@ for example, users giving ratings to movies.

It is common in many real-world use cases to only have access to *implicit feedback* (e.g. views,
clicks, purchases, likes, shares etc.). The approach used in `spark.ml` to deal with such data is taken
-from [Collaborative Filtering for Implicit Feedback Datasets](http://dx.doi.org/10.1109/ICDM.2008.22).
+from [Collaborative Filtering for Implicit Feedback Datasets](https://doi.org/10.1109/ICDM.2008.22).
Essentially, instead of trying to model the matrix of ratings directly, this approach treats the data
as numbers representing the *strength* in observations of user actions (such as the number of clicks,
or the cumulative duration someone spent viewing a movie). Those numbers are then related to the level of
@@ -55,7 +55,7 @@ We scale the regularization parameter `regParam` in solving each least squares p
the number of ratings the user generated in updating user factors,
or the number of ratings the product received in updating product factors.
This approach is named "ALS-WR" and discussed in the paper
"[Large-Scale Parallel Collaborative Filtering for the Netflix Prize](http://dx.doi.org/10.1007/978-3-540-68880-8_32)".
"[Large-Scale Parallel Collaborative Filtering for the Netflix Prize](https://doi.org/10.1007/978-3-540-68880-8_32)".
It makes `regParam` less dependent on the scale of the dataset, so we can apply the
best parameter learned from a sampled subset to the full dataset and expect similar performance.
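
A hedged SparkR sketch of ALS with implicit feedback (hypothetical interaction counts standing in for ratings):

ratings <- createDataFrame(data.frame(user = c(0L, 0L, 1L, 2L),
                                      item = c(0L, 1L, 1L, 2L),
                                      rating = c(4.0, 2.0, 3.0, 5.0)))
model <- spark.als(ratings, ratingCol = "rating", userCol = "user", itemCol = "item",
                   rank = 10, regParam = 0.1, implicitPrefs = TRUE, alpha = 1.0)
head(predict(model, ratings))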

8 changes: 4 additions & 4 deletions docs/ml-frequent-pattern-mining.md
@@ -18,15 +18,15 @@ for more information.
## FP-Growth

The FP-growth algorithm is described in the paper
-[Han et al., Mining frequent patterns without candidate generation](http://dx.doi.org/10.1145/335191.335372),
+[Han et al., Mining frequent patterns without candidate generation](https://doi.org/10.1145/335191.335372),
where "FP" stands for frequent pattern.
Given a dataset of transactions, the first step of FP-growth is to calculate item frequencies and identify frequent items.
Different from [Apriori-like](http://en.wikipedia.org/wiki/Apriori_algorithm) algorithms designed for the same purpose,
the second step of FP-growth uses a suffix tree (FP-tree) structure to encode transactions without generating candidate sets
explicitly, which are usually expensive to generate.
After the second step, the frequent itemsets can be extracted from the FP-tree.
In `spark.mllib`, we implemented a parallel version of FP-growth called PFP,
-as described in [Li et al., PFP: Parallel FP-growth for query recommendation](http://dx.doi.org/10.1145/1454008.1454027).
+as described in [Li et al., PFP: Parallel FP-growth for query recommendation](https://doi.org/10.1145/1454008.1454027).
PFP distributes the work of growing FP-trees based on the suffixes of transactions,
and hence is more scalable than a single-machine implementation.
We refer users to the papers for more details.
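
A hedged SparkR sketch (hypothetical transactions; itemsCol must hold arrays of items):

raw <- createDataFrame(data.frame(raw_items = c("1,2,5", "1,2,3,5", "1,2")))
transactions <- selectExpr(raw, "split(raw_items, ',') AS items")
model <- spark.fpGrowth(transactions, itemsCol = "items", minSupport = 0.5, minConfidence = 0.6)
head(spark.freqItemsets(model))       # frequent itemsets with frequencies
head(spark.associationRules(model))   # rules with antecedent, consequent, confidence
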
@@ -90,7 +90,7 @@ Refer to the [R API docs](api/R/spark.fpGrowth.html) for more details.

PrefixSpan is a sequential pattern mining algorithm described in
[Pei et al., Mining Sequential Patterns by Pattern-Growth: The
-PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
+PrefixSpan Approach](https://doi.org/10.1109%2FTKDE.2004.77). We refer
the reader to the referenced paper for formalizing the sequential
pattern mining problem.
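
A SparkR sketch, assuming the Spark 3.0 API name spark.findFrequentSequentialPatterns (each row is a sequence, i.e. a list of itemsets):

df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
                           list(list(list(1L), list(3L, 2L), list(1L, 2L))),
                           list(list(list(1L, 2L), list(5L))),
                           list(list(list(6L)))),
                      schema = c("sequence"))
head(spark.findFrequentSequentialPatterns(df, minSupport = 0.5, maxPatternLength = 5L))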

@@ -137,4 +137,4 @@ Refer to the [R API docs](api/R/spark.prefixSpan.html) for more details.
{% include_example r/ml/prefixSpan.R %}
</div>

-</div>
\ No newline at end of file
+</div>
4 changes: 2 additions & 2 deletions docs/mllib-collaborative-filtering.md
@@ -37,7 +37,7 @@ for example, users giving ratings to movies.

It is common in many real-world use cases to only have access to *implicit feedback* (e.g. views,
clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to deal with such data is taken
-from [Collaborative Filtering for Implicit Feedback Datasets](http://dx.doi.org/10.1109/ICDM.2008.22).
+from [Collaborative Filtering for Implicit Feedback Datasets](https://doi.org/10.1109/ICDM.2008.22).
Essentially, instead of trying to model the matrix of ratings directly, this approach treats the data
as numbers representing the *strength* in observations of user actions (such as the number of clicks,
or the cumulative duration someone spent viewing a movie). Those numbers are then related to the level of
@@ -51,7 +51,7 @@ Since v1.1, we scale the regularization parameter `lambda` in solving each least
the number of ratings the user generated in updating user factors,
or the number of ratings the product received in updating product factors.
This approach is named "ALS-WR" and discussed in the paper
"[Large-Scale Parallel Collaborative Filtering for the Netflix Prize](http://dx.doi.org/10.1007/978-3-540-68880-8_32)".
"[Large-Scale Parallel Collaborative Filtering for the Netflix Prize](https://doi.org/10.1007/978-3-540-68880-8_32)".
It makes `lambda` less dependent on the scale of the dataset, so we can apply the
best parameter learned from a sampled subset to the full dataset and expect similar performance.

6 changes: 3 additions & 3 deletions docs/mllib-frequent-pattern-mining.md
@@ -15,15 +15,15 @@ a popular algorithm to mining frequent itemsets.
## FP-growth

The FP-growth algorithm is described in the paper
-[Han et al., Mining frequent patterns without candidate generation](http://dx.doi.org/10.1145/335191.335372),
+[Han et al., Mining frequent patterns without candidate generation](https://doi.org/10.1145/335191.335372),
where "FP" stands for frequent pattern.
Given a dataset of transactions, the first step of FP-growth is to calculate item frequencies and identify frequent items.
Different from [Apriori-like](http://en.wikipedia.org/wiki/Apriori_algorithm) algorithms designed for the same purpose,
the second step of FP-growth uses a suffix tree (FP-tree) structure to encode transactions without generating candidate sets
explicitly, which are usually expensive to generate.
After the second step, the frequent itemsets can be extracted from the FP-tree.
In `spark.mllib`, we implemented a parallel version of FP-growth called PFP,
-as described in [Li et al., PFP: Parallel FP-growth for query recommendation](http://dx.doi.org/10.1145/1454008.1454027).
+as described in [Li et al., PFP: Parallel FP-growth for query recommendation](https://doi.org/10.1145/1454008.1454027).
PFP distributes the work of growing FP-trees based on the suffixes of transactions,
and hence is more scalable than a single-machine implementation.
We refer users to the papers for more details.
@@ -122,7 +122,7 @@ Refer to the [`AssociationRules` Java docs](api/java/org/apache/spark/mllib/fpm/

PrefixSpan is a sequential pattern mining algorithm described in
[Pei et al., Mining Sequential Patterns by Pattern-Growth: The
-PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
+PrefixSpan Approach](https://doi.org/10.1109%2FTKDE.2004.77). We refer
the reader to the referenced paper for formalizing the sequential
pattern mining problem.

4 changes: 2 additions & 2 deletions docs/mllib-isotonic-regression.md
@@ -24,9 +24,9 @@ Essentially isotonic regression is a
best fitting the original data points.

`spark.mllib` supports a
-[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111)
+[pool adjacent violators algorithm](https://doi.org/10.1198/TECH.2010.10111)
which uses an approach to
-[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10).
+[parallelizing isotonic regression](https://doi.org/10.1007/978-3-642-99789-1_10).
The training input is an RDD of tuples of three double values that represent
label, feature, and weight, in this order. Additionally, the IsotonicRegression algorithm has one
optional parameter, $isotonic$, defaulting to true.
2 changes: 1 addition & 1 deletion docs/sparkr.md
@@ -718,4 +718,4 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma
## Upgrading to SparkR 3.0.0

- The deprecated methods `sparkR.init`, `sparkRSQL.init`, `sparkRHive.init` have been removed. Use `sparkR.session` instead.
-- The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, `registerTempTable`, `createExternalTable`, `dropTempTable`, `unionAll` have been removed. Use `read.parquet`, `write.parquet`, `read.json`, `createOrReplaceTempView`, `createTable`, `dropTempView`, `union` instead.
+- The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, `registerTempTable`, `createExternalTable`, and `dropTempTable` have been removed. Use `read.parquet`, `write.parquet`, `read.json`, `createOrReplaceTempView`, `createTable`, `dropTempView` instead.
2 changes: 2 additions & 0 deletions docs/sql-migration-guide-upgrade.md
@@ -9,6 +9,8 @@ displayTitle: Spark SQL Upgrading Guide

## Upgrading From Spark SQL 2.4 to 3.0

- Since Spark 3.0, the Dataset and DataFrame API `unionAll` is no longer deprecated. It is an alias for `union`.

- In PySpark, when creating a `SparkSession` with `SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, the builder tried to update the `SparkConf` of the existing `SparkContext` with configurations specified in the builder; but the `SparkContext` is shared by all `SparkSession`s, so it should not be updated. Since 3.0, the builder no longer updates the configurations. This matches the Java/Scala API behavior in 2.3 and above. If you want to change the configurations, update them before creating a `SparkSession`.

- In Spark version 2.4 and earlier, the JSON data source parser treats empty strings as null for some data types, such as `IntegerType`; for `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, empty strings are disallowed, and exceptions are thrown for all data types except `StringType` and `BinaryType`.
2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -263,7 +263,7 @@ object KMeansModel extends MLReadable[KMeansModel] {
/**
* K-means clustering with support for k-means|| initialization proposed by Bahmani et al.
*
* @see <a href="http://dx.doi.org/10.14778/2180912.2180915">Bahmani et al., Scalable k-means++.</a>
* @see <a href="https://doi.org/10.14778/2180912.2180915">Bahmani et al., Scalable k-means++.</a>
*/
@Since("1.5.0")
class KMeans @Since("1.5.0") (
4 changes: 2 additions & 2 deletions mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
@@ -118,10 +118,10 @@ private[fpm] trait FPGrowthParams extends Params with HasPredictionCol {
/**
* :: Experimental ::
* A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
* <a href="http://dx.doi.org/10.1145/1454008.1454027">Li et al., PFP: Parallel FP-Growth for Query
* <a href="https://doi.org/10.1145/1454008.1454027">Li et al., PFP: Parallel FP-Growth for Query
* Recommendation</a>. PFP distributes computation in such a way that each worker executes an
* independent group of mining tasks. The FP-Growth algorithm is described in
* <a href="http://dx.doi.org/10.1145/335191.335372">Han et al., Mining frequent patterns without
* <a href="https://doi.org/10.1145/335191.335372">Han et al., Mining frequent patterns without
* candidate generation</a>. Note null values in the itemsCol column are ignored during fit().
*
* @see <a href="http://en.wikipedia.org/wiki/Association_rule_learning">