New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-7462] By default retain group by columns in aggregate #5996
Changes from all commits
1e6e666
d910141
b8b87e1
c1de670
5f923c0
f6858f6
aac7119
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,6 +74,9 @@ private[spark] object SQLConf { | |
// See SPARK-6231. | ||
val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY = "spark.sql.selfJoinAutoResolveAmbiguity" | ||
|
||
// Whether to retain group by columns or not in GroupedData.agg. | ||
val DATAFRAME_RETAIN_GROUP_COLUMNS = "spark.sql.retainGroupColumns" | ||
|
||
val USE_SQL_SERIALIZER2 = "spark.sql.useSerializer2" | ||
|
||
val USE_JACKSON_STREAMING_API = "spark.sql.json.useJacksonStreamingAPI" | ||
|
@@ -242,6 +245,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf { | |
|
||
private[spark] def dataFrameSelfJoinAutoResolveAmbiguity: Boolean = | ||
getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY, "true").toBoolean | ||
|
||
private[spark] def dataFrameRetainGroupColumns: Boolean = | ||
getConf(DATAFRAME_RETAIN_GROUP_COLUMNS, "true").toBoolean | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Increasingly wondering if dataframe flags should be scoped (eager analysis affects There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's talk more about this. if we want to do it, we should do it in 1.4. |
||
|
||
/** ********************** SQLConf functionality methods ************ */ | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql | ||
|
||
import org.apache.spark.sql.TestData._ | ||
import org.apache.spark.sql.functions._ | ||
import org.apache.spark.sql.test.TestSQLContext | ||
import org.apache.spark.sql.test.TestSQLContext.implicits._ | ||
import org.apache.spark.sql.types.DecimalType | ||
|
||
|
||
class DataFrameAggregateSuite extends QueryTest { | ||
|
||
test("groupBy") { | ||
checkAnswer( | ||
testData2.groupBy("a").agg(sum($"b")), | ||
Seq(Row(1, 3), Row(2, 3), Row(3, 3)) | ||
) | ||
checkAnswer( | ||
testData2.groupBy("a").agg(sum($"b").as("totB")).agg(sum('totB)), | ||
Row(9) | ||
) | ||
checkAnswer( | ||
testData2.groupBy("a").agg(count("*")), | ||
Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil | ||
) | ||
checkAnswer( | ||
testData2.groupBy("a").agg(Map("*" -> "count")), | ||
Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil | ||
) | ||
checkAnswer( | ||
testData2.groupBy("a").agg(Map("b" -> "sum")), | ||
Row(1, 3) :: Row(2, 3) :: Row(3, 3) :: Nil | ||
) | ||
|
||
val df1 = Seq(("a", 1, 0, "b"), ("b", 2, 4, "c"), ("a", 2, 3, "d")) | ||
.toDF("key", "value1", "value2", "rest") | ||
|
||
checkAnswer( | ||
df1.groupBy("key").min(), | ||
df1.groupBy("key").min("value1", "value2").collect() | ||
) | ||
checkAnswer( | ||
df1.groupBy("key").min("value2"), | ||
Seq(Row("a", 0), Row("b", 4)) | ||
) | ||
} | ||
|
||
test("spark.sql.retainGroupColumns config") { | ||
checkAnswer( | ||
testData2.groupBy("a").agg(sum($"b")), | ||
Seq(Row(1, 3), Row(2, 3), Row(3, 3)) | ||
) | ||
|
||
TestSQLContext.conf.setConf("spark.sql.retainGroupColumns", "false") | ||
checkAnswer( | ||
testData2.groupBy("a").agg(sum($"b")), | ||
Seq(Row(3), Row(3), Row(3)) | ||
) | ||
TestSQLContext.conf.setConf("spark.sql.retainGroupColumns", "true") | ||
} | ||
|
||
test("agg without groups") { | ||
checkAnswer( | ||
testData2.agg(sum('b)), | ||
Row(9) | ||
) | ||
} | ||
|
||
test("average") { | ||
checkAnswer( | ||
testData2.agg(avg('a)), | ||
Row(2.0)) | ||
|
||
// Also check mean | ||
checkAnswer( | ||
testData2.agg(mean('a)), | ||
Row(2.0)) | ||
|
||
checkAnswer( | ||
testData2.agg(avg('a), sumDistinct('a)), // non-partial | ||
Row(2.0, 6.0) :: Nil) | ||
|
||
checkAnswer( | ||
decimalData.agg(avg('a)), | ||
Row(new java.math.BigDecimal(2.0))) | ||
checkAnswer( | ||
decimalData.agg(avg('a), sumDistinct('a)), // non-partial | ||
Row(new java.math.BigDecimal(2.0), new java.math.BigDecimal(6)) :: Nil) | ||
|
||
checkAnswer( | ||
decimalData.agg(avg('a cast DecimalType(10, 2))), | ||
Row(new java.math.BigDecimal(2.0))) | ||
// non-partial | ||
checkAnswer( | ||
decimalData.agg(avg('a cast DecimalType(10, 2)), sumDistinct('a cast DecimalType(10, 2))), | ||
Row(new java.math.BigDecimal(2.0), new java.math.BigDecimal(6)) :: Nil) | ||
} | ||
|
||
test("null average") { | ||
checkAnswer( | ||
testData3.agg(avg('b)), | ||
Row(2.0)) | ||
|
||
checkAnswer( | ||
testData3.agg(avg('b), countDistinct('b)), | ||
Row(2.0, 1)) | ||
|
||
checkAnswer( | ||
testData3.agg(avg('b), sumDistinct('b)), // non-partial | ||
Row(2.0, 2.0)) | ||
} | ||
|
||
test("zero average") { | ||
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") | ||
checkAnswer( | ||
emptyTableData.agg(avg('a)), | ||
Row(null)) | ||
|
||
checkAnswer( | ||
emptyTableData.agg(avg('a), sumDistinct('b)), // non-partial | ||
Row(null, null)) | ||
} | ||
|
||
test("count") { | ||
assert(testData2.count() === testData2.map(_ => 1).count()) | ||
|
||
checkAnswer( | ||
testData2.agg(count('a), sumDistinct('a)), // non-partial | ||
Row(6, 6.0)) | ||
} | ||
|
||
test("null count") { | ||
checkAnswer( | ||
testData3.groupBy('a).agg(count('b)), | ||
Seq(Row(1,0), Row(2, 1)) | ||
) | ||
|
||
checkAnswer( | ||
testData3.groupBy('a).agg(count('a + 'b)), | ||
Seq(Row(1,0), Row(2, 1)) | ||
) | ||
|
||
checkAnswer( | ||
testData3.agg(count('a), count('b), count(lit(1)), countDistinct('a), countDistinct('b)), | ||
Row(2, 1, 2, 2, 1) | ||
) | ||
|
||
checkAnswer( | ||
testData3.agg(count('b), countDistinct('b), sumDistinct('b)), // non-partial | ||
Row(1, 1, 2) | ||
) | ||
} | ||
|
||
test("zero count") { | ||
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") | ||
assert(emptyTableData.count() === 0) | ||
|
||
checkAnswer( | ||
emptyTableData.agg(count('a), sumDistinct('a)), // non-partial | ||
Row(0, null)) | ||
} | ||
|
||
test("zero sum") { | ||
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") | ||
checkAnswer( | ||
emptyTableData.agg(sum('a)), | ||
Row(null)) | ||
} | ||
|
||
test("zero sum distinct") { | ||
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") | ||
checkAnswer( | ||
emptyTableData.agg(sumDistinct('a)), | ||
Row(null)) | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about
retainedExprs
andaggExprs
have the same expressions, should we distinct them?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i'm not sure if we should do that, since in dataframe you can technically duplicate columns.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, then i think this is ok, distinct them may make users confused.