[SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames

apache · May 5, 2015 · a417ba5 · a417ba5
1 parent 8aa5aea
commit a417ba5
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging {
   /** Generate a table of frequencies for the elements of two columns. */
   private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
     val tableName = s"${col1}_$col2"
-    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
-    if (counts.length == 1e8.toInt) {
-      logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
+    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt)
+    if (counts.length == 1e6.toInt) {
+      logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " +
         "the pairs. Please try reducing the amount of distinct items in your columns.")
     }
     // get the distinct values of column 2, so that we can make them the column names