Skip to content

Commit

Permalink
[SPARK-8972] [SQL] Incorrect result for rollup
Browse files Browse the repository at this point in the history
We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result.

e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP`

This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names).

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7343 from chenghao-intel/expand and squashes the following commits:

1ebbb59 [Cheng Hao] update the comment
827873f [Cheng Hao] update as feedback
34def69 [Cheng Hao] Add more unit test and comments
c695760 [Cheng Hao] fix bug of incorrect result for rollup
  • Loading branch information
chenghao-intel authored and yhuai committed Jul 16, 2015
1 parent ba33096 commit e272123
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -194,16 +194,52 @@ class Analyzer(
}

def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case a if !a.childrenResolved => a // be sure all of the children are resolved.
case a: Cube =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case a: Rollup =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case x: GroupingSets =>
val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
// We will insert another Projection if the GROUP BY keys contains the
// non-attribute expressions. And the top operators can references those
// expressions by its alias.
// e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
// SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a

// find all of the non-attribute expressions in the GROUP BY keys
val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()

// The pair of (the original GROUP BY key, associated attribute)
val groupByExprPairs = x.groupByExprs.map(_ match {
case e: NamedExpression => (e, e.toAttribute)
case other => {
val alias = Alias(other, other.toString)()
nonAttributeGroupByExpressions += alias // add the non-attributes expression alias
(other, alias.toAttribute)
}
})

// substitute the non-attribute expressions for aggregations.
val aggregation = x.aggregations.map(expr => expr.transformDown {
case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
}.asInstanceOf[NamedExpression])

// substitute the group by expressions.
val newGroupByExprs = groupByExprPairs.map(_._2)

val child = if (nonAttributeGroupByExpressions.length > 0) {
// insert additional projection if contains the
// non-attribute expressions in the GROUP BY keys
Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
} else {
x.child
}

Aggregate(
x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
x.aggregations,
Expand(x.bitmasks, x.groupByExprs, gid, x.child))
newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
aggregation,
Expand(x.bitmasks, newGroupByExprs, gid, child))
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
500 NULL 0
91 0 1
84 1 1
105 2 1
113 3 1
107 4 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 NULL -3 2
1 NULL -1 2
1 NULL 3 2
1 NULL 4 2
1 NULL 5 2
1 NULL 6 2
1 NULL 12 2
1 NULL 14 2
1 NULL 15 2
1 NULL 22 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 NULL -3 2
1 NULL -1 2
1 NULL 3 2
1 NULL 4 2
1 NULL 5 2
1 NULL 6 2
1 NULL 12 2
1 NULL 14 2
1 NULL 15 2
1 NULL 22 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
500 NULL 0
91 0 1
84 1 1
105 2 1
113 3 1
107 4 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 0 5 3
1 0 15 3
1 0 25 3
1 0 60 3
1 0 75 3
1 0 80 3
1 0 100 3
1 0 140 3
1 0 145 3
1 0 150 3
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 0 5 3
1 0 15 3
1 0 25 3
1 0 60 3
1 0 75 3
1 0 80 3
1 0 100 3
1 0 140 3
1 0 145 3
1 0 150 3
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
}
}

createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
"""
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP
""".stripMargin)

createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM src group by key%5, key-5
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)

createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)

createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
"""
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE
""".stripMargin)

createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)

createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)

createQueryTest("insert table with generator with column name",
"""
| CREATE TABLE gen_tmp (key Int);
Expand Down

0 comments on commit e272123

Please sign in to comment.