-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-12720] [SQL] SQL Generation Support for Cube, Rollup, and Grouping Sets #11283
Changes from 45 commits
01e4cdf
6835704
9180687
b38a21e
d2b84af
fda8025
ac0dccd
6e0018b
0546772
b37a64f
c2a872c
ab6dbd7
4276356
2dab708
0458770
1debdfa
763706d
4de6ec1
9422a4f
52bdf48
1e95df3
fab24cf
8b2e33b
2ee1876
b9f0090
ade6f7e
9fd63d2
bc0c030
9292abe
5199d49
ea5c567
a408325
3514213
6a36593
404214c
bc5e96f
1aef849
c001dd9
9b1d420
6f79df1
37a9d8d
640e45c
749be1b
ae768fe
6cea658
5d8923c
b1925e5
6f609fb
9eaca51
b8786b2
59daa48
385c0d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,13 +24,13 @@ import scala.util.control.NonFatal | |
import org.apache.spark.Logging | ||
import org.apache.spark.sql.{DataFrame, SQLContext} | ||
import org.apache.spark.sql.catalyst.TableIdentifier | ||
import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, NonSQLExpression, | ||
SortOrder} | ||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.optimizer.CollapseProject | ||
import org.apache.spark.sql.catalyst.plans.logical._ | ||
import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} | ||
import org.apache.spark.sql.catalyst.util.quoteIdentifier | ||
import org.apache.spark.sql.execution.datasources.LogicalRelation | ||
import org.apache.spark.sql.types.{ByteType, IntegerType} | ||
|
||
/** | ||
* A builder class used to convert a resolved logical plan into a SQL query string. Note that this | ||
|
@@ -107,6 +107,11 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi | |
case p: Project => | ||
projectToSQL(p, isDistinct = false) | ||
|
||
case a @ Aggregate(_, _, e @ Expand(_, _, p: Project)) | ||
if sameOutput(e.output, | ||
p.child.output ++ a.groupingExpressions.map(_.asInstanceOf[Attribute])) => | ||
groupingSetToSQL(a, e, p) | ||
|
||
case p: Aggregate => | ||
aggregateToSQL(p) | ||
|
||
|
@@ -203,6 +208,10 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi | |
throw new UnsupportedOperationException(s"unsupported plan $node") | ||
} | ||
|
||
private def sameOutput(left: Seq[Attribute], right: Seq[Attribute]): Boolean = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you! |
||
left.forall(a => right.exists(_.semanticEquals(a))) && | ||
right.forall(a => left.exists(_.semanticEquals(a))) | ||
|
||
/** | ||
* Turns a bunch of string segments into a single string and separate each segment by a space. | ||
* The segments are trimmed so only a single space appears in the separation. | ||
|
@@ -233,6 +242,67 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi | |
) | ||
} | ||
|
||
private def groupingSetToSQL( | ||
plan: Aggregate, | ||
expand: Expand, | ||
project: Project): String = { | ||
require(plan.groupingExpressions.length > 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. Thanks! |
||
|
||
// The last column of Expand is always grouping ID | ||
val gid = expand.output.last | ||
|
||
val groupByAttributes = plan.groupingExpressions.dropRight(1).map(_.asInstanceOf[Attribute]) | ||
val groupByExprs = | ||
project.projectList.drop(project.child.output.length).map(_.asInstanceOf[Alias].child) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we can define |
||
val groupByAttrMap = AttributeMap(groupByAttributes.zip(groupByExprs)) | ||
val groupingSQL = groupByExprs.map(_.sql).mkString(", ") | ||
|
||
val groupingSet = expand.projections.map { project => | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Would be nice to add type annotation to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. |
||
// Assumption: expand.projections are composed of | ||
// 1) the original output (Project's child.output), | ||
// 2) group by attributes (or null literal) | ||
// 3) gid, which is always the last one in each project in Expand | ||
project.dropRight(1).collect { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add comments to explain our assumption here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: |
||
case attr: Attribute if groupByAttrMap.contains(attr) => groupByAttrMap(attr) | ||
} | ||
} | ||
val groupingSetSQL = | ||
"GROUPING SETS(" + | ||
groupingSet.map(e => s"(${e.map(_.sql).mkString(", ")})").mkString(", ") + ")" | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can the above be simplified to:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The groupingExpressions could be expressions. For example, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. uh, you are right. Let me change it. : ) |
||
val aggExprs = plan.aggregateExpressions.map { case expr => | ||
expr.transformDown { | ||
// grouping_id() is converted to VirtualColumn.groupingIdName by Analyzer. Revert it back. | ||
case ar: AttributeReference if ar eq gid => GroupingID(Nil) | ||
case a @ Alias(_ @ Cast(BitwiseAnd( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we also remove this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, let me remove it. Thanks! |
||
ShiftRight(ar: AttributeReference, _ @ Literal(value: Any, IntegerType)), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Why the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let me remove it. |
||
Literal(1, IntegerType)), ByteType), name) if ar == gid => | ||
// for converting an expression to its original SQL format grouping(col) | ||
val idx = groupByExprs.length - 1 - value.asInstanceOf[Int] | ||
val groupingCol = groupByExprs.lift(idx) | ||
if (groupingCol.isDefined) { | ||
Grouping(groupingCol.get) | ||
} else { | ||
throw new UnsupportedOperationException(s"unsupported operator $a") | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The following version might be clearer: val groupingCol = groupByExprs.applyOrElse(
idx, (_: Int) => throw new UnsupportedOperationException(s"unsupported operator $a"))
Grouping(groupingCol) And I don't quite get the meaning of the exception error message... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, if the value is out of boundary, I thought we should not continue the conversion. After rethinking this, users might call grouping_id() inside such a function. Maybe we should not throw any exception. How about changing it to groupByExprs.lift(idx).map(Grouping).getOrElse(a) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense. |
||
case a @ Alias(ar: AttributeReference, _) if groupByAttrMap.contains(ar) => | ||
groupByAttrMap(ar) | ||
case ar: AttributeReference if groupByAttrMap.contains(ar) => | ||
groupByAttrMap(ar) | ||
} | ||
} | ||
|
||
build( | ||
"SELECT", | ||
aggExprs.map(_.sql).mkString(", "), | ||
if (plan.child == OneRowRelation) "" else "FROM", | ||
toSQL(project.child), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add some test cases where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added another two test cases for this. Now, we have three |
||
"GROUP BY", | ||
groupingSQL, | ||
groupingSetSQL | ||
) | ||
} | ||
|
||
object Canonicalizer extends RuleExecutor[LogicalPlan] { | ||
override protected def batches: Seq[Batch] = Seq( | ||
Batch("Canonicalizer", FixedPoint(100), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I missed this: we should check
isInstanceOf
before calling asInstanceOf
directly. We can put all of it in one method and use it as an if condition.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for pointing it out! I will be more careful next time.