diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index f30a92dfc8534..fca45945a1e66 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -934,10 +934,11 @@ def cov(self, col1, col2): def crosstab(self, col1, col2): """ Computes a pair-wise frequency table of the given columns. Also known as a contingency - table. The number of distinct values for each column should be less than 1e4. The first - column of each row will be the distinct values of `col1` and the column names will be the - distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that - have no occurrences will have `null` as their counts. + table. The number of distinct values for each column should be less than 1e4. At most 1e6 + non-zero pair frequencies will be returned. + The first column of each row will be the distinct values of `col1` and the column names + will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. + Pairs that have no occurrences will have `null` as their counts. :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases. :param col1: The name of the first column. Distinct items will make the first item of diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index fcf21ca741a7c..e864f76fb39bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. - * The number of distinct values for each column should be less than 1e4. 
The first - * column of each row will be the distinct values of `col1` and the column names will be the - * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be - * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts. + * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero + * pair frequencies will be returned. + * The first column of each row will be the distinct values of `col1` and the column names will + * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts + * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts. * * @param col1 The name of the first column. Distinct items will make the first item of * each row.