From aa809e39baf222e698315a5efb2d583cab99aad7 Mon Sep 17 00:00:00 2001 From: Patrick Woody Date: Wed, 1 Nov 2017 11:44:51 -0400 Subject: [PATCH] SPARK-22408: reduce work of calculating pivot distinct values --- .../scala/org/apache/spark/sql/RelationalGroupedDataset.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 21e94fa8bb0b1..3e4edd4ea8cf3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -321,10 +321,10 @@ class RelationalGroupedDataset protected[sql]( // Get the distinct values of the column and sort them so its consistent val values = df.select(pivotColumn) .distinct() + .limit(maxValues + 1) .sort(pivotColumn) // ensure that the output columns are in a consistent logical order - .rdd + .collect() .map(_.get(0)) - .take(maxValues + 1) .toSeq if (values.length > maxValues) {