[SPARK-47252][DOCS] Clarify that pivot may trigger an eager computation #45363

Closed
wants to merge 4 commits
@@ -259,15 +259,12 @@ class RelationalGroupedDataset private[sql] (
   /**
    * Pivots a column of the current `DataFrame` and performs the specified aggregation.
    *
-   * There are two versions of `pivot` function: one that requires the caller to specify the list
-   * of distinct values to pivot on, and one that does not. The latter is more concise but less
-   * efficient, because Spark needs to first compute the list of distinct values internally.
+   * Spark will eagerly compute the distinct values in `pivotColumn` so it can determine the
+   * resulting schema of the transformation. To avoid any eager computations, provide an explicit
+   * list of values via `pivot(pivotColumn: String, values: Seq[Any])`.
    *
    * {{{
    *   // Compute the sum of earnings for each year by course with each course as a separate column
    *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
-   *
-   *   // Or without specifying column values (less efficient)
-   *   df.groupBy("year").pivot("course").sum("earnings")
    * }}}
    *
@@ -392,11 +389,14 @@ class RelationalGroupedDataset private[sql] (
   }

   /**
-   * Pivots a column of the current `DataFrame` and performs the specified aggregation. This is an
-   * overloaded version of the `pivot` method with `pivotColumn` of the `String` type.
+   * Pivots a column of the current `DataFrame` and performs the specified aggregation.
+   *
+   * Spark will eagerly compute the distinct values in `pivotColumn` so it can determine the
+   * resulting schema of the transformation. To avoid any eager computations, provide an explicit
+   * list of values via `pivot(pivotColumn: Column, values: Seq[Any])`.
    *
    * {{{
-   *   // Or without specifying column values (less efficient)
+   *   // Compute the sum of earnings for each year by course with each course as a separate column
    *   df.groupBy($"year").pivot($"course").sum($"earnings");
    * }}}
    *
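To make the documented behavior concrete, here is a minimal, self-contained sketch of the difference the new wording describes. The object name, toy data, and local-mode session are illustrative assumptions; the `pivot` calls mirror the docstring examples above.

import org.apache.spark.sql.SparkSession

object PivotEagernessSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pivot-sketch").getOrCreate()
    import spark.implicits._

    // Toy data matching the year/course/earnings example in the docstring.
    val df = Seq(
      (2012, "dotNET", 10000), (2012, "Java", 20000),
      (2013, "dotNET", 48000), (2013, "Java", 30000)
    ).toDF("year", "course", "earnings")

    // With explicit values the result schema is known up front, so this line
    // runs no Spark job; everything stays lazy until an action like show().
    val lazyPivot = df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")

    // Without values, Spark launches a job right here to collect the distinct
    // values of `course`, because it needs them to determine the result schema.
    val eagerPivot = df.groupBy("year").pivot("course").sum("earnings")

    lazyPivot.show()
    eagerPivot.show()
    spark.stop()
  }
}

With explicit values, nothing executes until an action; without them, a distinct-values job on `course` runs at the `pivot` call itself, which is the eager computation the revised docs call out.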
python/pyspark/sql/group.py (10 changes: 5 additions & 5 deletions)
@@ -432,11 +432,7 @@ def sum(self, *cols: str) -> DataFrame:  # type: ignore[empty-body]

     def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) -> "GroupedData":
         """
-        Pivots a column of the current :class:`DataFrame` and perform the specified aggregation.
-        There are two versions of the pivot function: one that requires the caller
-        to specify the list of distinct values to pivot on, and one that does not.
-        The latter is more concise but less efficient,
-        because Spark needs to first compute the list of distinct values internally.
+        Pivots a column of the current :class:`DataFrame` and performs the specified aggregation.

         .. versionadded:: 1.6.0

@@ -450,6 +446,10 @@ def pivot(self, pivot_col: str, values: Optional[List["LiteralType"]] = None) ->
         values : list, optional
             List of values that will be translated to columns in the output DataFrame.

+            If ``values`` is not provided, Spark will eagerly compute the distinct values in
+            ``pivot_col`` so it can determine the resulting schema of the transformation. To avoid
+            any eager computations, provide an explicit list of values.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -324,15 +324,12 @@ class RelationalGroupedDataset protected[sql](
   /**
    * Pivots a column of the current `DataFrame` and performs the specified aggregation.
    *
-   * There are two versions of `pivot` function: one that requires the caller to specify the list
-   * of distinct values to pivot on, and one that does not. The latter is more concise but less
-   * efficient, because Spark needs to first compute the list of distinct values internally.
+   * Spark will eagerly compute the distinct values in `pivotColumn` so it can determine
+   * the resulting schema of the transformation. To avoid any eager computations, provide an
+   * explicit list of values via `pivot(pivotColumn: String, values: Seq[Any])`.
    *
    * {{{
    *   // Compute the sum of earnings for each year by course with each course as a separate column
    *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
-   *
-   *   // Or without specifying column values (less efficient)
-   *   df.groupBy("year").pivot("course").sum("earnings")
    * }}}
    *
@@ -407,10 +404,13 @@ class RelationalGroupedDataset protected[sql](

   /**
    * Pivots a column of the current `DataFrame` and performs the specified aggregation.
-   * This is an overloaded version of the `pivot` method with `pivotColumn` of the `String` type.
+   *
+   * Spark will eagerly compute the distinct values in `pivotColumn` so it can determine
+   * the resulting schema of the transformation. To avoid any eager computations, provide an
+   * explicit list of values via `pivot(pivotColumn: Column, values: Seq[Any])`.
    *
    * {{{
-   *   // Or without specifying column values (less efficient)
+   *   // Compute the sum of earnings for each year by course with each course as a separate column
    *   df.groupBy($"year").pivot($"course").sum($"earnings");
    * }}}
    *
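The same trade-off applies to the `Column`-typed overload touched in this last hunk. A short sketch of the non-eager form, reusing the `spark` session and `df` from the earlier sketch (both are assumptions carried over, not part of this PR):

import spark.implicits._

// Explicit values fix the output schema (a "dotNET" and a "Java" column),
// so Spark does not need to run a distinct-values job when pivot() is called.
val pivoted = df.groupBy($"year")
  .pivot($"course", Seq("dotNET", "Java"))
  .sum("earnings")

// Inspecting the plan is safe here: no pivot-related job has run yet.
pivoted.explain()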