From a417ba5ec0f0e04e66b5de88dcbc1cead8fe937c Mon Sep 17 00:00:00 2001
From: Burak Yavuz
Date: Mon, 4 May 2015 19:53:09 -0700
Subject: [PATCH 1/3] [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames

---
 .../org/apache/spark/sql/execution/stat/StatFunctions.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index b50f606d9cbe3..386ac969f1e7d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging {
   /** Generate a table of frequencies for the elements of two columns. */
   private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
     val tableName = s"${col1}_$col2"
-    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
-    if (counts.length == 1e8.toInt) {
-      logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
+    val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt)
+    if (counts.length == 1e6.toInt) {
+      logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " +
         "the pairs. Please try reducing the amount of distinct items in your columns.")
     }
     // get the distinct values of column 2, so that we can make them the column names

From b30ace27be6e26ae536215a165d9db2fdde1fcf8 Mon Sep 17 00:00:00 2001
From: Burak Yavuz
Date: Mon, 4 May 2015 22:33:36 -0700
Subject: [PATCH 2/3] address comments

---
 python/pyspark/sql/dataframe.py                              | 9 +++++----
 .../org/apache/spark/sql/DataFrameStatFunctions.scala        | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f30a92dfc8534..fca45945a1e66 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -934,10 +934,11 @@ def cov(self, col1, col2):
     def crosstab(self, col1, col2):
         """
         Computes a pair-wise frequency table of the given columns. Also known as a contingency
-        table. The number of distinct values for each column should be less than 1e4. The first
-        column of each row will be the distinct values of `col1` and the column names will be the
-        distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
-        have no occurrences will have `null` as their counts.
+        table. The number of distinct values for each column should be less than 1e4. At most, 1e6
+        non-zero pair frequencies returned will be returned.
+        The first column of each row will be the distinct values of `col1` and the column names
+        will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
+        Pairs that have no occurrences will have `null` as their counts.
         :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
 
         :param col1: The name of the first column. Distinct items will make the first item of
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index fcf21ca741a7c..e864f76fb39bd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
   /**
    * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
-   * The number of distinct values for each column should be less than 1e4. The first
-   * column of each row will be the distinct values of `col1` and the column names will be the
-   * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be
-   * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
+   * The number of distinct values for each column should be less than 1e4. At most, 1e6 non-zero
+   * pair frequencies returned will be returned.
+   * The first column of each row will be the distinct values of `col1` and the column names will
+   * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
+   * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
    *
    * @param col1 The name of the first column. Distinct items will make the first item of
    *             each row.

From c11e76254f5ad1c48a248abcf2adddd9b919ae55 Mon Sep 17 00:00:00 2001
From: Burak Yavuz
Date: Tue, 5 May 2015 08:23:48 -0700
Subject: [PATCH 3/3] fix grammar

---
 python/pyspark/sql/dataframe.py                              | 4 ++--
 .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index fca45945a1e66..17448b38c30a1 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -934,8 +934,8 @@ def cov(self, col1, col2):
     def crosstab(self, col1, col2):
         """
         Computes a pair-wise frequency table of the given columns. Also known as a contingency
-        table. The number of distinct values for each column should be less than 1e4. At most, 1e6
-        non-zero pair frequencies returned will be returned.
+        table. The number of distinct values for each column should be less than 1e4. At most 1e6
+        non-zero pair frequencies will be returned.
         The first column of each row will be the distinct values of `col1` and the column names
         will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
         Pairs that have no occurrences will have `null` as their counts.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index e864f76fb39bd..cb88deab35968 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -65,8 +65,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
   /**
    * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
-   * The number of distinct values for each column should be less than 1e4. At most, 1e6 non-zero
-   * pair frequencies returned will be returned.
+   * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
+   * pair frequencies will be returned.
    * The first column of each row will be the distinct values of `col1` and the column names will
    * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
    * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
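
For context, a minimal usage sketch of the crosstab API these patches document. It is illustrative only: the SQLContext setup, the sample data, and the column names "age" and "gender" are assumptions, not part of the patches, and the sketch presumes an existing SparkContext `sc` (as in spark-shell) on a Spark 1.4-era API.

import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)  // `sc` is an existing SparkContext (assumed)

// Hypothetical example data: (age, gender) pairs.
val df = sqlContext.createDataFrame(Seq(
  (25, "m"), (25, "f"), (30, "m"), (25, "m")
)).toDF("age", "gender")

// Contingency table as described in the docstrings above: the first column is
// named "age_gender", the remaining columns are the distinct values of "gender",
// and each cell holds the pair count (null where a pair never occurs).
val ct = df.stat.crosstab("age", "gender")
ct.show()

The 1e6 cap from the first patch bounds how many distinct (col1, col2) pairs crossTabulate collects to the driver; with the documented limit of fewer than 1e4 distinct values per column, the pivoted table can be built locally without excessive memory use.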