
[SPARK-19851] Add support for EVERY and ANY (SOME) aggregates #22047

Closed
wants to merge 13 commits
22 changes: 22 additions & 0 deletions python/pyspark/sql/functions.py
@@ -403,6 +403,28 @@ def countDistinct(col, *cols):
return Column(jc)


def every(col):
Member

Please keep the SQL functions and remove the function APIs. Thanks!


Contributor Author

@gatorsmile Hi Sean, I have prepared two branches: one in which these new aggregate functions extend the base Max and Min classes, basically reusing code, and another in which we replace these aggregate expressions in the optimizer. Below are the links.

  1. branch-extend

  2. branch-rewrite

I would prefer option 1 for the following reasons.

  1. The code changes are simpler.
  2. It supports these aggregates as window expressions naturally; in the other option I have to block that.
  3. For these simple mappings, we probably don't need a rewrite framework. We could add one in the future if we need a more complex transformation.

Please let me know how we want to move forward with this. Thanks!
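For context, a minimal sketch of the intuition behind option 1 (purely illustrative; this is not the code in either linked branch): Booleans order as false < true, so EVERY over a boolean column behaves like Min (logical AND of the non-null values) and ANY/SOME behave like Max (logical OR), which is why reusing the Min/Max machinery keeps the changes small. The object and helper names below are made up.

// Illustrative only -- not the actual branch-extend code.
// Because false < true, Min acts as AND and Max as OR over a group of booleans.
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, Max, Min}

object BooleanAggSketch {
  // Hypothetical helpers mapping the new aggregates onto the existing ones.
  def everyAsMin(child: Expression): AggregateFunction = Min(child)  // true only if all non-null values are true
  def anyAsMax(child: Expression): AggregateFunction = Max(child)    // true if at least one non-null value is true
}

Like Min and Max, this mapping ignores NULL inputs and yields NULL for an all-NULL group, which matches the semantics implemented in this PR.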

Contributor

+1 for option 1

Contributor Author

@cloud-fan Thank you very much for your response. I will create a new PR based on option 1 today and close this one.

"""Aggregate function: returns true if all values in a group are true.
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.every(_to_java_column(col))
return Column(jc)


def any(col):
"""Aggregate function: returns true if at least one value in the group is true.
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.any(_to_java_column(col))
return Column(jc)


def some(col):
"""Aggregate function: returns true if at least one value in the group is true.
"""
return any(col)


@since(1.3)
def first(col, ignorenulls=False):
"""Aggregate function: returns the first value in a group.
87 changes: 87 additions & 0 deletions python/pyspark/sql/tests.py
@@ -1532,6 +1532,24 @@ def test_cov(self):
cov = df.stat.cov(u"a", "b")
self.assertTrue(abs(cov - 55.0 / 3) < 1e-6)

def test_every_any(self):
from pyspark.sql import functions
data = [
Row(key="a", value=False),
Row(key="a", value=True),
Row(key="a", value=False),
Row(key="b", value=True),
Row(key="b", value=True),
Row(key="c", value=False),
Row(key="d", value=True),
Row(key="d", value=None)
]
df = self.sc.parallelize(data).toDF()
df2 = df.select(functions.every(df.value).alias('a'),
functions.any(df.value).alias('b'),
functions.some(df.value).alias('c'))
self.assertEqual([Row(a=False, b=True, c=True)], df2.collect())

def test_crosstab(self):
df = self.sc.parallelize([Row(a=i % 3, b=i % 2) for i in range(1, 7)]).toDF()
ct = df.stat.crosstab(u"a", "b").collect()
@@ -3938,6 +3956,75 @@ def test_window_functions_cumulative_sum(self):
for r, ex in zip(rs, expected):
self.assertEqual(tuple(r), ex[:len(r)])

def test_window_functions_every_any(self):
df = self.spark.createDataFrame([
("a", False),
("a", True),
("a", False),
("b", True),
("b", True),
("c", False),
("d", True),
("d", None)
], ["key", "value"])
w = Window \
.partitionBy("key").orderBy("value") \
.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
from pyspark.sql import functions as F
sel = df.select(df.key,
df.value,
F.every("value").over(w),
F.any("value").over(w),
F.some("value").over(w))
rs = sel.collect()
expected = [
("a", False, False, True, True),
("a", False, False, True, True),
("a", True, False, True, True),
("b", True, True, True, True),
("b", True, True, True, True),
("c", False, False, False, False),
("d", None, True, True, True),
("d", True, True, True, True)
]
for r, ex in zip(rs, expected):
self.assertEqual(tuple(r), ex[:len(r)])

def test_window_functions_every_any_without_partitionBy(self):
df = self.spark.createDataFrame([
(False,),
(True,),
(False,),
(True,),
(True,),
(False,),
(True,),
(None,)
], ["value"])
w1 = Window.orderBy("value").rowsBetween(Window.unboundedPreceding, 0)
w2 = Window.orderBy("value").rowsBetween(-1, 0)
from pyspark.sql import functions as F
sel = df.select(df.value,
F.every("value").over(w1),
F.any("value").over(w1),
F.some("value").over(w1),
F.every("value").over(w2),
F.any("value").over(w2),
F.some("value").over(w2))
rs = sel.collect()
expected = [
(None, None, None, None, None, None, None),
(False, False, False, False, False, False, False),
(False, False, False, False, False, False, False),
(False, False, False, False, False, False, False),
(True, False, True, True, False, True, True),
(True, False, True, True, True, True, True),
(True, False, True, True, True, True, True),
(True, False, True, True, True, True, True)
]
for r, ex in zip(rs, expected):
self.assertEqual(tuple(r), ex[:len(r)])

def test_collect_functions(self):
df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
from pyspark.sql import functions
@@ -300,6 +300,9 @@ object FunctionRegistry {
expression[CollectList]("collect_list"),
expression[CollectSet]("collect_set"),
expression[CountMinSketchAgg]("count_min_sketch"),
expression[Every]("every"),
expression[AnyAgg]("any"),
expression[AnyAgg]("some"),

// string functions
expression[Ascii]("ascii"),
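With the three registrations above in place, every, any, and some become resolvable from SQL. A minimal usage sketch follows (illustrative only: it assumes a build containing this PR, and the view name t and columns k/v are made up).

// Illustrative only: assumes this PR's registrations are in the build.
import org.apache.spark.sql.SparkSession

object EveryAnyDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("every-any-demo").getOrCreate()
    import spark.implicits._

    Seq(("a", true), ("a", false), ("b", true)).toDF("k", "v").createOrReplaceTempView("t")

    // every(v) is true only if every value in the group is true; any/some(v) if at least one is.
    spark.sql("SELECT k, every(v), any(v), some(v) FROM t GROUP BY k").show()
    // Expected: a -> false, true, true ; b -> true, true, true

    spark.stop()
  }
}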
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types._

@ExpressionDescription(
usage = "_FUNC_(expr) - Returns true if at least one value of `expr` is true.")
Member

BTW, don't forget to add since.

case class AnyAgg(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {

override def children: Seq[Expression] = child :: Nil

override def nullable: Boolean = true

override def dataType: DataType = BooleanType

override def inputTypes: Seq[AbstractDataType] = Seq(BooleanType)

override def checkInputDataTypes(): TypeCheckResult =
TypeUtils.checkForBooleanExpr(child.dataType, "function any")

private lazy val some = AttributeReference("some", BooleanType)()

private lazy val valueSet = AttributeReference("valueSet", BooleanType)()

override lazy val aggBufferAttributes = some :: valueSet :: Nil

override lazy val initialValues: Seq[Expression] = Seq(
/* some = */ Literal.create(false, BooleanType),
/* valueSet = */ Literal.create(false, BooleanType)
)

override lazy val updateExpressions: Seq[Expression] = Seq(
/* some = */ Or(some, If (child.isNull, some, child)),
/* valueSet = */ valueSet || child.isNotNull
)

override lazy val mergeExpressions: Seq[Expression] = Seq(
/* some = */ Or(some.left, some.right),
/* valueSet = */ valueSet.right || valueSet.left
)

override lazy val evaluateExpression: Expression =
If (valueSet, some, Literal.create(null, BooleanType))
}
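For reference, a plain-Scala model of how the (some, valueSet) buffer above behaves, using Option to stand in for SQL NULL. This is illustrative only, not Catalyst code; the AnyAggModel object and Buffer class are made up here.

// Illustrative model of AnyAgg's two-slot buffer; not Catalyst code.
object AnyAggModel {
  final case class Buffer(some: Boolean, valueSet: Boolean) {
    // Mirrors updateExpressions: a null input leaves the buffer unchanged.
    def update(v: Option[Boolean]): Buffer = v match {
      case Some(b) => Buffer(some || b, valueSet = true)
      case None    => this
    }
    // Mirrors mergeExpressions.
    def merge(other: Buffer): Buffer = Buffer(some || other.some, valueSet || other.valueSet)
    // Mirrors evaluateExpression: a group with no non-null values yields NULL.
    def eval: Option[Boolean] = if (valueSet) Some(some) else None
  }

  def main(args: Array[String]): Unit = {
    val init = Buffer(some = false, valueSet = false)
    assert(Seq(Some(false), None, Some(true)).foldLeft(init)(_ update _).eval.contains(true))  // {false, null, true} -> true
    assert(Seq(Some(false), None).foldLeft(init)(_ update _).eval.contains(false))             // {false, null} -> false
    assert(init.eval.isEmpty)                                                                  // all-null or empty group -> null
  }
}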
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types._

@ExpressionDescription(
usage = "_FUNC_(expr) - Returns true if all values of `expr` are true.")
case class Every(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {

override def children: Seq[Expression] = child :: Nil

override def nullable: Boolean = true

override def dataType: DataType = BooleanType

override def inputTypes: Seq[AbstractDataType] = Seq(BooleanType)

override def checkInputDataTypes(): TypeCheckResult =
TypeUtils.checkForBooleanExpr(child.dataType, "function every")

private lazy val every = AttributeReference("every", BooleanType)()

private lazy val valueSet = AttributeReference("valueSet", BooleanType)()

override lazy val aggBufferAttributes = every :: valueSet :: Nil

override lazy val initialValues: Seq[Expression] = Seq(
/* every = */ Literal.create(true, BooleanType),
/* valueSet = */ Literal.create(false, BooleanType)
)

override lazy val updateExpressions: Seq[Expression] = Seq(
/* every = */ And(every, If (child.isNull, every, child)),
/* valueSet = */ valueSet || child.isNotNull
)

override lazy val mergeExpressions: Seq[Expression] = Seq(
/* every = */ And(every.left, every.right),
/* valueSet = */ valueSet.right || valueSet.left
)

override lazy val evaluateExpression: Expression =
If (valueSet, every, Literal.create(null, BooleanType))
}
@@ -33,6 +33,14 @@ object TypeUtils {
}
}

def checkForBooleanExpr(dt: DataType, caller: String): TypeCheckResult = {
if (dt.isInstanceOf[BooleanType] || dt == NullType) {
TypeCheckResult.TypeCheckSuccess
} else {
TypeCheckResult.TypeCheckFailure(s"$caller requires boolean types, not $dt")
}
}

def checkForOrderingExpr(dt: DataType, caller: String): TypeCheckResult = {
if (RowOrdering.isOrderable(dt)) {
TypeCheckResult.TypeCheckSuccess
@@ -144,6 +144,8 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
assertSuccess(Sum('stringField))
assertSuccess(Average('stringField))
assertSuccess(Min('arrayField))
assertSuccess(Every('booleanField))
assertSuccess(AnyAgg('booleanField))

assertError(Min('mapField), "min does not support ordering on type")
assertError(Max('mapField), "max does not support ordering on type")
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Literal}
import org.apache.spark.sql.types.BooleanType

class AnyTestSuite extends SparkFunSuite {
val input = AttributeReference("input", BooleanType, nullable = true)()
val evaluator = DeclarativeAggregateEvaluator(AnyAgg(input), Seq(input))

test("empty buffer") {
assert(evaluator.initialize() === InternalRow(false, false))
}

test("update") {
val result = evaluator.update(
InternalRow(true),
InternalRow(false),
InternalRow(true))
assert(result === InternalRow(true, true))
}

test("merge") {
// Empty merge
val p0 = evaluator.initialize()
assert(evaluator.merge(p0) === InternalRow(false, false))

// Single merge
val p1 = evaluator.update(InternalRow(true), InternalRow(true))
assert(evaluator.merge(p1) === InternalRow(true, true))

// Multiple merges.
val p2 = evaluator.update(InternalRow(false), InternalRow(null))
assert(evaluator.merge(p1, p2) === InternalRow(true, true))

// Empty partitions (p0 is empty)
assert(evaluator.merge(p0, p2) === InternalRow(false, true))
assert(evaluator.merge(p2, p1, p0) === InternalRow(true, true))
}

test("eval") {
// Null Eval
assert(evaluator.eval(InternalRow(true, false)) === InternalRow(null))
assert(evaluator.eval(InternalRow(false, false)) === InternalRow(null))

// Empty Eval
val p0 = evaluator.initialize()
assert(evaluator.eval(p0) === InternalRow(null))

// Update - Eval
val p1 = evaluator.update(InternalRow(true), InternalRow(null))
assert(evaluator.eval(p1) === InternalRow(true))

// Update - Merge - Eval
val p2 = evaluator.update(InternalRow(false), InternalRow(false))
val m1 = evaluator.merge(p0, p2)
assert(evaluator.eval(m1) === InternalRow(false))

// Update - Merge - Eval (empty partition at the end)
val m2 = evaluator.merge(p2, p1, p0)
assert(evaluator.eval(m2) === InternalRow(true))
}
}