[SPARK-47252][DOCS] Clarify that pivot may trigger an eager computation #45363

Closed · wants to merge 4 commits
Changes from 2 commits
@@ -259,18 +259,19 @@ class RelationalGroupedDataset private[sql] (
/**
* Pivots a column of the current `DataFrame` and performs the specified aggregation.
*
* There are two versions of `pivot` function: one that requires the caller to specify the list
* of distinct values to pivot on, and one that does not. The latter is more concise but less
* efficient, because Spark needs to first compute the list of distinct values internally.
*
* {{{
* // Compute the sum of earnings for each year by course with each course as a separate column
* df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
*
* // Or without specifying column values (less efficient)
* df.groupBy("year").pivot("course").sum("earnings")
* }}}
*
* @note
Member
I wonder if we can just make it a bit shorter and put it into the main doc instead of a separate note. I don't want to scare users about this; see, e.g., how DataFrameReader.csv documents schema inference.

Contributor Author
I trimmed the note a bit. Is that better?

I also took a look at the CSV reader method:

* This function will go through the input once to determine the input schema if `inferSchema`
* is enabled. To avoid going through the entire data once, disable `inferSchema` option or
* specify the schema explicitly using `schema`.

It's pretty similar to what I'm proposing here.

I believe it's more important to highlight the eager computation here since pivot is a transformation and, unlike with reader methods, users are probably not expecting expensive computations to be triggered. But I agree, we don't want to make it sound like there's something wrong with not specifying pivot values.
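To make the trade-off concrete, here is a small, self-contained sketch (not part of this PR's diff) of the behavior the note describes. The data and column names mirror the `year`/`course`/`earnings` example from the scaladoc; everything else (object name, local master) is made up for illustration.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch, assuming a local Spark session; illustrates which pivot
// call triggers an eager job and which one stays lazy.
object PivotEagernessSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pivot-sketch").getOrCreate()
    import spark.implicits._

    // Toy data mirroring the scaladoc example.
    val df = Seq(
      (2012, "dotNET", 10000), (2012, "Java", 20000),
      (2013, "dotNET", 48000), (2013, "Java", 30000)
    ).toDF("year", "course", "earnings")

    // Explicit pivot values: the whole pipeline stays lazy until an action.
    val explicit = df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")

    // No pivot values: Spark runs a distinct-values job on "course" here,
    // before any action has been called on `inferred`.
    val inferred = df.groupBy("year").pivot("course").sum("earnings")

    explicit.show()
    inferred.show()
    spark.stop()
  }
}
```

If you want the convenience of the no-values overload but prefer the extra job to be explicit, you can also collect the distinct values yourself and pass them to the two-argument overload.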

* Spark will '''eagerly''' compute the distinct values in `pivotColumn` so it can determine
* the resulting schema of the transformation. Depending on the size and complexity of your
* data, this may take some time. In other words, though the pivot transformation is lazy like
* most DataFrame transformations, computing the distinct pivot values is not. To avoid any
* eager computations, provide an explicit list of values via `pivot(pivotColumn: String,
* values: Seq[Any])`.
*
* @see
* `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the
* aggregation.
@@ -392,14 +393,21 @@ class RelationalGroupedDataset private[sql] (
}

/**
* Pivots a column of the current `DataFrame` and performs the specified aggregation. This is an
* overloaded version of the `pivot` method with `pivotColumn` of the `String` type.
* Pivots a column of the current `DataFrame` and performs the specified aggregation.
*
* {{{
* // Or without specifying column values (less efficient)
* // Compute the sum of earnings for each year by course with each course as a separate column
* df.groupBy($"year").pivot($"course").sum($"earnings");
* }}}
*
* @note
* Spark will '''eagerly''' compute the distinct values in `pivotColumn` so it can determine
* the resulting schema of the transformation. Depending on the size and complexity of your
* data, this may take some time. In other words, though the pivot transformation is lazy like
* most DataFrame transformations, computing the distinct pivot values is not. To avoid any
* eager computations, provide an explicit list of values via `pivot(pivotColumn: Column,
* values: Seq[Any])`.
*
* @see
* `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the
* aggregation.
14 changes: 9 additions & 5 deletions python/pyspark/sql/group.py
@@ -432,11 +432,7 @@ def sum(self, *cols: str) -> DataFrame: # type: ignore[empty-body]

def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) -> "GroupedData":
"""
Pivots a column of the current :class:`DataFrame` and perform the specified aggregation.
There are two versions of the pivot function: one that requires the caller
to specify the list of distinct values to pivot on, and one that does not.
The latter is more concise but less efficient,
because Spark needs to first compute the list of distinct values internally.
Pivots a column of the current :class:`DataFrame` and performs the specified aggregation.

.. versionadded:: 1.6.0

@@ -450,6 +446,14 @@ def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) ->
values : list, optional
List of values that will be translated to columns in the output DataFrame.

.. note:: If ``values`` is not provided, Spark will **eagerly** compute the distinct
Member
This too. I would just put it up in the doctest (like DataFrameReader.csv).

values in ``pivot_col`` so it can determine the resulting schema of the
transformation. Depending on the size and complexity of your data, this may take
some time.
In other words, though the pivot transformation is lazy like most DataFrame
transformations, computing the distinct pivot values is not. To avoid any eager
computations, provide an explicit list of values.

Examples
--------
>>> from pyspark.sql import Row
@@ -324,18 +324,18 @@ class RelationalGroupedDataset protected[sql](
/**
* Pivots a column of the current `DataFrame` and performs the specified aggregation.
*
* There are two versions of `pivot` function: one that requires the caller to specify the list
* of distinct values to pivot on, and one that does not. The latter is more concise but less
* efficient, because Spark needs to first compute the list of distinct values internally.
*
* {{{
* // Compute the sum of earnings for each year by course with each course as a separate column
* df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
*
* // Or without specifying column values (less efficient)
* df.groupBy("year").pivot("course").sum("earnings")
* }}}
*
* @note Spark will '''eagerly''' compute the distinct values in `pivotColumn` so it can determine
* the resulting schema of the transformation. Depending on the size and complexity of your
* data, this may take some time. In other words, though the pivot transformation is lazy like
* most DataFrame transformations, computing the distinct pivot values is not. To avoid any
* eager computations, provide an explicit list of values via
* `pivot(pivotColumn: String, values: Seq[Any])`.
Contributor Author
I probably spent about an hour trying to get this to work as a proper link via [[pivot(...]], per the scaladoc docs on ambiguous links, but I could not get it to work.

*
* @see `org.apache.spark.sql.Dataset.unpivot` for the reverse operation,
* except for the aggregation.
*
@@ -407,13 +407,19 @@ class RelationalGroupedDataset protected[sql](

/**
* Pivots a column of the current `DataFrame` and performs the specified aggregation.
* This is an overloaded version of the `pivot` method with `pivotColumn` of the `String` type.
*
* {{{
* // Or without specifying column values (less efficient)
* // Compute the sum of earnings for each year by course with each course as a separate column
* df.groupBy($"year").pivot($"course").sum($"earnings");
* }}}
*
* @note Spark will '''eagerly''' compute the distinct values in `pivotColumn` so it can determine
* the resulting schema of the transformation. Depending on the size and complexity of your
* data, this may take some time. In other words, though the pivot transformation is lazy like
* most DataFrame transformations, computing the distinct pivot values is not. To avoid any
* eager computations, provide an explicit list of values via
* `pivot(pivotColumn: Column, values: Seq[Any])`.
*
* @see `org.apache.spark.sql.Dataset.unpivot` for the reverse operation,
* except for the aggregation.
*