addressed comments v4.1

apache · May 4, 2015 · a07c01e · a07c01e
1 parent ae9e01d
commit a07c01e
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 7 deletions.
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -915,7 +915,8 @@ def crosstab(self, col1, col2):
         Computes a pair-wise frequency table of the given columns. Also known as a contingency
         table. The number of distinct values for each column should be less than 1e4. The first
         column of each row will be the distinct values of `col1` and the column names will be the
-        distinct values of `col2`. Pairs that have no occurrences will have `null` as their counts.
+        distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
+        have no occurrences will have `null` as their counts.
         :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
 
         :param col1: The name of the first column. Distinct items will make the first item of

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -67,8 +67,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
    * The number of distinct values for each column should be less than 1e4. The first
    * column of each row will be the distinct values of `col1` and the column names will be the
-   * distinct values of `col2`. Counts will be returned as `Long`s. Pairs that have no occurrences
-   * will have `null` as their counts.
+   * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be
+   * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
    *
    * @param col1 The name of the first column. Distinct items will make the first item of
    *             each row.

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.sql.execution.stat
 
-import org.apache.spark.sql.{Column, DataFrame, Row}
+import org.apache.spark.Logging
+import org.apache.spark.sql.{Column, DataFrame}
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, Cast}
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
-private[sql] object StatFunctions {
+private[sql] object StatFunctions extends Logging {
 
   /** Calculate the Pearson Correlation Coefficient for the given columns */
   private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = {
@@ -102,18 +103,22 @@ private[sql] object StatFunctions {
   private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
     val tableName = s"${col1}_$col2"
     val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
+    if (counts.length == 1e8.toInt) {
+      logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
+        "the pairs. Please try reducing the amount of distinct items in your columns.")
+    }
     // get the distinct values of column 2, so that we can make them the column names
     val distinctCol2 = counts.map(_.get(1)).distinct.zipWithIndex.toMap
     val columnSize = distinctCol2.size
     require(columnSize < 1e4, s"The number of distinct values for $col2, can't " +
       s"exceed 1e4. Currently $columnSize")
-    val table = counts.groupBy(_.get(0)).map { case (col1Items, rows) =>
+    val table = counts.groupBy(_.get(0)).map { case (col1Item, rows) =>
       val countsRow = new GenericMutableRow(columnSize + 1)
       rows.foreach { row =>
         countsRow.setLong(distinctCol2.get(row.get(1)).get + 1, row.getLong(2))
       }
       // the value of col1 is the first value, the rest are the counts
-      countsRow.setString(0, col1Items.toString)
+      countsRow.setString(0, col1Item.toString)
       countsRow
     }.toSeq
     val headerNames = distinctCol2.map(r => StructField(r._1.toString, LongType)).toSeq