From eadbfd93eb223faa1f95ba77c14e3b29361ef14f Mon Sep 17 00:00:00 2001 From: dhirennavani Date: Sat, 30 Oct 2021 20:48:24 -0700 Subject: [PATCH] [SPARK-37171][SQL] Add forany and forall to Datasets/DataFrames Add forany and forall APIs to the DataFrame/Dataset API. These provide a higher level of abstraction for Spark users. User-facing change: yes, forany and forall methods are added to the DataFrame/Dataset API. Testing: added new unit tests. --- .../scala/org/apache/spark/sql/Dataset.scala | 72 +++++++++++++++++++ .../org/apache/spark/sql/DatasetSuite.scala | 60 ++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index c8cdc209a2464..b207115be039a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1638,6 +1638,58 @@ class Dataset[T] private[sql]( c5: TypedColumn[T, U5]): Dataset[(U1, U2, U3, U4, U5)] = selectUntyped(c1, c2, c3, c4, c5).asInstanceOf[Dataset[(U1, U2, U3, U4, U5)]] + /** + * Checks if the condition is true for any of the rows. + * {{{ + * peopleDs.forany($"age" > 15) + * }}} + * + * @group typedrel + * @since 3.3.0 + */ + def forany(condition: Column): Boolean = { + !filter(condition).isEmpty + } + + /** + * Checks if the SQL expression is true for any of the rows. + * {{{ + * peopleDs.forany("age > 15") + * }}} + * + * @group typedrel + * @since 3.3.0 + */ + def forany(conditionExpr: String): Boolean = { + !filter(conditionExpr).isEmpty + } + + /** + * Checks if the condition is true for all the rows. + * {{{ + * peopleDs.forall($"age" > 15) + * }}} + * + * @group typedrel + * @since 3.3.0 + */ + def forall(condition: Column): Boolean = { + filter(!condition).isEmpty + } + + /** + * Checks if the SQL expression is true for all the rows.
+ * {{{ + * peopleDs.forall("age > 15") + * }}} + * + * @group typedrel + * @since 3.3.0 + */ + def forall(conditionExpr: String): Boolean = { + filter(!Column(sparkSession.sessionState.sqlParser.parseExpression(conditionExpr))).isEmpty + } + /** * Filters rows using the given condition. * {{{ @@ -2860,6 +2912,26 @@ class Dataset[T] private[sql]( */ def transform[U](t: Dataset[T] => Dataset[U]): Dataset[U] = t(this) + /** + * Checks if there are any elements for which `func` returns `true`. + * + * @group typedrel + * @since 3.3.0 + */ + def forany(func: T => Boolean): Boolean = { + !filter(func).isEmpty + } + + /** + * Checks if `func` returns `true` for all elements. + * + * @group typedrel + * @since 3.3.0 + */ + def forall(func: T => Boolean): Boolean = { + filter((x: T) => !func(x)).isEmpty + } + /** * (Scala-specific) * Returns a new Dataset that only contains elements where `func` returns `true`. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index a46ef5d8e9cdb..44976ac44d562 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -395,6 +395,66 @@ class DatasetSuite extends QueryTest } } + test("SPARK-37171: forany returns true if condition is true for any row") { + val df = Seq("aa", "bb", "cc", "abc").toDF("zoo") + assert(df.forany($"zoo".contains(Array('a', 'b')))) + } + + test("SPARK-37171: forall returns true if condition true for all rows") { + val df = Seq("ab", "ba").toDF("zoo") + assert(df.forall($"zoo".contains("a"))) + } + + test("SPARK-37171: forany returns false if condition is false for all rows") { + val df = Seq("aa", "bb", "cc").toDF("zoo") + assert(!df.forany($"zoo".contains(Array('a', 'b')))) + } + + test("SPARK-37171: forall returns false if condition false for any rows") { + val df = Seq("ab", "ba").toDF("zoo") +
assert(!df.forall($"zoo".contains("c"))) + } + + test("SPARK-37171: forany expression returns true if condition is true for any row") { + val df = Seq("aa", "bb", "cc", "abc").toDF("zoo") + assert(df.forany("zoo like 'ab%'")) + } + + test("SPARK-37171: forall expressions returns true if condition true for all rows") { + val df = Seq("ab", "ba").toDF("zoo") + assert(df.forall("zoo like '%a%'")) + } + + test("SPARK-37171: forany expressions returns false if condition is false for all rows") { + val df = Seq("aa", "bb", "cc").toDF("zoo") + assert(!df.forany("zoo like '%ab%'")) + } + + test("SPARK-37171: forall expressions returns false if condition false for any rows") { + val df = Seq("ab", "ba").toDF("zoo") + assert(!df.forall("zoo like '%c%'")) + } + + test("SPARK-37171: forany function returns true if condition is true for any row") { + val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() + assert(ds.forany(_._1 == "b")) + } + + test("SPARK-37171: forall function returns true if condition is true for all rows") { + val ds = Seq(("a", 1), ("a", 1), ("a", 2)).toDS() + assert(ds.forall(_._1 == "a")) + } + + test("SPARK-37171: forany function returns false if condition is false for all rows") { + val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() + assert(!ds.forany(_._1 == "d")) + } + + test("SPARK-37171: forall function returns false if condition is false for any row") { + val ds = Seq(("a", 1), ("a", 1), ("a", 2)).toDS() + assert(!ds.forall(_._2 == 2)) + } + test("filter") { val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() checkDataset(