diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index fca45945a1e66..17448b38c30a1 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -934,8 +934,8 @@ def cov(self, col1, col2): def crosstab(self, col1, col2): """ Computes a pair-wise frequency table of the given columns. Also known as a contingency - table. The number of distinct values for each column should be less than 1e4. At most, 1e6 - non-zero pair frequencies returned will be returned. + table. The number of distinct values for each column should be less than 1e4. At most 1e6 + non-zero pair frequencies will be returned. The first column of each row will be the distinct values of `col1` and the column names will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no occurrences will have `null` as their counts. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index e864f76fb39bd..cb88deab35968 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -65,8 +65,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. - * The number of distinct values for each column should be less than 1e4. At most, 1e6 non-zero - * pair frequencies returned will be returned. + * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero + * pair frequencies will be returned. * The first column of each row will be the distinct values of `col1` and the column names will * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.