-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-24371] [SQL] Added isInCollection in DataFrame API for Scala and Java. #21416
Changes from all commits
da10307
730b19b
6ff2806
286a468
1332406
fed2846
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
|
||
package org.apache.spark.sql | ||
|
||
import scala.collection.JavaConverters._ | ||
import scala.language.implicitConversions | ||
|
||
import org.apache.spark.annotation.InterfaceStability | ||
|
@@ -786,6 +787,24 @@ class Column(val expr: Expression) extends Logging { | |
@scala.annotation.varargs | ||
def isin(list: Any*): Column = withExpr { In(expr, list.map(lit(_).expr)) } | ||
|
||
/** | ||
* A boolean expression that is evaluated to true if the value of this expression is contained | ||
* by the provided collection. | ||
* | ||
* @group expr_ops | ||
* @since 2.4.0 | ||
*/ | ||
def isInCollection(values: scala.collection.Iterable[_]): Column = isin(values.toSeq: _*) | ||
|
||
/** | ||
* A boolean expression that is evaluated to true if the value of this expression is contained | ||
* by the provided collection. | ||
* | ||
* @group java_expr_ops | ||
* @since 2.4.0 | ||
*/ | ||
def isInCollection(values: java.lang.Iterable[_]): Column = isInCollection(values.asScala) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not that we need it for sure, but in the past some of our Java APIs have been difficult to call from Java and I think that if were making an API designed to be called from Java it might make sense to have a test case for it in Java. Here I understand it's not super important, so just a suggestion. |
||
|
||
/** | ||
* SQL like expression. Returns a boolean column based on a SQL LIKE match. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,10 @@ | |
|
||
package org.apache.spark.sql | ||
|
||
import java.util.Locale | ||
|
||
import scala.collection.JavaConverters._ | ||
|
||
import org.apache.hadoop.io.{LongWritable, Text} | ||
import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat} | ||
import org.scalatest.Matchers._ | ||
|
@@ -390,11 +394,67 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { | |
checkAnswer(df.filter($"b".isin("z", "y")), | ||
df.collect().toSeq.filter(r => r.getString(1) == "z" || r.getString(1) == "y")) | ||
|
||
// Auto casting should work with mixture of different types in collections | ||
checkAnswer(df.filter($"a".isin(1.toShort, "2")), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isin("3", 2.toLong)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isin(3, "1")), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) | ||
|
||
val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") | ||
|
||
intercept[AnalysisException] { | ||
val e = intercept[AnalysisException] { | ||
df2.filter($"a".isin($"b")) | ||
} | ||
Seq("cannot resolve", "due to data type mismatch: Arguments must be same type but were") | ||
.foreach { s => | ||
assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) | ||
} | ||
} | ||
|
||
test("isInCollection: Scala Collection") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we simplify the test cases? you are just testing this api as a wrapper. you don't need to run so many queries for type coercion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") | ||
// Test with different types of collections | ||
checkAnswer(df.filter($"a".isInCollection(Seq(3, 1))), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) | ||
checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isInCollection(Seq(3, 2).toArray)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) | ||
|
||
val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") | ||
|
||
val e = intercept[AnalysisException] { | ||
df2.filter($"a".isInCollection(Seq($"b"))) | ||
} | ||
Seq("cannot resolve", "due to data type mismatch: Arguments must be same type but were") | ||
.foreach { s => | ||
assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) | ||
} | ||
} | ||
|
||
test("isInCollection: Java Collection") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As stated up above, maybe this would make sense to do in Java, but your call. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I totally agree with you that we should have tests natively in Java instead of converting the types to Java in Scala and hope the best that it will work in Java. Let's do it in the followup PR. |
||
val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same thing here. just run a single test case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
// Test with different types of collections | ||
checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).asJava)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet.asJava)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) | ||
checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList.asJava)), | ||
df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) | ||
|
||
val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") | ||
|
||
val e = intercept[AnalysisException] { | ||
df2.filter($"a".isInCollection(Seq($"b").asJava)) | ||
} | ||
Seq("cannot resolve", "due to data type mismatch: Arguments must be same type but were") | ||
.foreach { s => | ||
assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) | ||
} | ||
} | ||
|
||
test("&&") { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know this is not your change, but I think (both here and bellow) something about the automagical type casting thats going on should be in the docstring/scaladoc/javadoc because to me its a little surprising how this will compare integers to strings and silently convert the types including if there are no strings which can be converted to integers. And I'd also include that in the isInCollection docstring/scaladoc/javadoc bellow.
I'd also point out that the result of the conversion needs to be of the same type and not of a sequence of the type (although the error message we get is pretty clear so your call).
Just a suggestion for improvement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 Let's do it in the followup PR. Thanks.