Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-37171][SQL]Add forany and forall to Datasets/Dataframes #34450

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
72 changes: 72 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,58 @@ class Dataset[T] private[sql](
c5: TypedColumn[T, U5]): Dataset[(U1, U2, U3, U4, U5)] =
selectUntyped(c1, c2, c3, c4, c5).asInstanceOf[Dataset[(U1, U2, U3, U4, U5)]]

/**
* Checks if the condition is true for any of the rows.
* {{{
* peopleDs.forany($"age" > 15)
* }}}
*
* @group typedrel
* @since 3.2.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can't be since 3.2.0
This would have to be in R, Scala APIs too
But this doesn't seem worth it - it isn't faster to execute

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That’s true. So do you recommend we drop this as it would be too much to implement in R and Python?

I believe the code is for Scala API.
Just FYI, isEmpty method is absent in Python API although present in Scala API.

Let me know if we still wanna go forward, I can change the version then. If you let me know the right version.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry yes I meant Python/R. I don't think this is worthwhile if it isn't optimizing or saving the caller much code

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problems :)

i will close the PR

*/
def forany(condition: Column): Boolean = {
!filter(condition).isEmpty
}

/**
* Checks if the SQL expression is true for any of the rows.
* {{{
* peopleDs.forany("age > 15")
* }}}
*
* @group typedrel
* @since 3.2.0
*/
def forany(conditionExpr: String): Boolean = {
!filter(conditionExpr).isEmpty
}

/**
* Checks if the condition is true for all the rows.
* {{{
* peopleDs.forall($"age" > 15)
* }}}
*
* @group typedrel
* @since 3.2.0
*/
def forall(condition: Column): Boolean = {
filter(!condition).isEmpty
}

/**
* Checks if the SQL expression is true for all the rows.
* {{{
* peopleDs.forall("age > 15")
* }}}
*
* @group typedrel
* @since 3.2.0
*/
def forall(conditionExpr: String): Boolean = {
filter(!Column(sparkSession.sessionState.sqlParser.parseExpression(conditionExpr))).isEmpty
}

/**
* Filters rows using the given condition.
* {{{
Expand Down Expand Up @@ -2860,6 +2912,26 @@ class Dataset[T] private[sql](
*/
def transform[U](t: Dataset[T] => Dataset[U]): Dataset[U] = t(this)

/**
* Checks if there are any elements for which `func` returns `true`.
*
* @group typedrel
* @since 3.2.0
*/
def forany(func: T => Boolean): Boolean = {
!filter(func).isEmpty
}

/**
* Checks if `func` returns `true` for all elements.
*
* @group typedrel
* @since 3.2.0
*/
def forall(func: T => Boolean): Boolean = {
filter((x: T) => !func(x)).isEmpty
}

/**
* (Scala-specific)
* Returns a new Dataset that only contains elements where `func` returns `true`.
Expand Down
60 changes: 60 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,66 @@ class DatasetSuite extends QueryTest
}
}

test("SPARK-37171: forany returns true if condition is true for any row") {
val df = Seq("aa", "bb", "cc", "abc").toDF("zoo")
assert(df.forany($"zoo".contains(Array('a', 'b'))))
}

test("SPARK-37171: forall returns true if condition true for all rows") {
val df = Seq("ab", "ba").toDF("zoo")
assert(df.forall($"zoo".contains("a")))
}

test("SPARK-37171: forany returns false if condition is false for all rows") {
val df = Seq("aa", "bb", "cc").toDF("zoo")
assert(!df.forany($"zoo".contains(Array('a', 'b'))))
}

test("SPARK-37171: forall returns false if condition false for any rows") {
val df = Seq("ab", "ba").toDF("zoo")
assert(!df.forall($"zoo".contains("c")))
}

test("SPARK-37171: forany expression returns true if condition is true for any row") {
val df = Seq("aa", "bb", "cc", "abc").toDF("zoo")
assert(df.forany("zoo like 'ab%'"))
}

test("SPARK-37171: forall expressions returns true if condition true for all rows") {
val df = Seq("ab", "ba").toDF("zoo")
assert(df.forall("zoo like '%a%'"))
}

test("SPARK-37171: forany expressions returns false if condition is false for all rows") {
val df = Seq("aa", "bb", "cc").toDF("zoo")
assert(!df.forany("zoo like '%ab%'"))
}

test("SPARK-37171: forall expressions returns false if condition false for any rows") {
val df = Seq("ab", "ba").toDF("zoo")
assert(!df.forall("zoo like '%c%'"))
}

test("SPARK-37171: forany function returns true if condition is true for any row") {
val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
assert(ds.forany(_._1 == "b"))
}

test("SPARK-37171: forall function returns true if condition is true for all rows") {
val ds = Seq(("a", 1), ("a", 1), ("a", 2)).toDS()
assert(ds.forall(_._1 == "a"))
}

test("SPARK-37171: forany function returns false if condition is false for all rows") {
val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
assert(!ds.forany(_._1 == "d"))
}

test("SPARK-37171: forall function returns false if condition is false for any row") {
val ds = Seq(("a", 1), ("a", 1), ("a", 2)).toDS()
assert(!ds.forall(_._2 == 2))
}

test("filter") {
val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
checkDataset(
Expand Down