[SPARK-32183][DOCS][PYTHON] User Guide - PySpark Usage Guide for Pandas with Apache Arrow

### What changes were proposed in this pull request?

This PR proposes to move Arrow usage guide from Spark documentation site to PySpark documentation site (at "User Guide").

Here is a demo for quicker review: https://hyukjin-spark.readthedocs.io/en/stable/user_guide/arrow_pandas.html

### Why are the changes needed?

To give PySpark users a single place to look, and to improve the documentation.

### Does this PR introduce _any_ user-facing change?

Yes, it will move https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html to our PySpark documentation.

### How was this patch tested?

```bash
cd docs
SKIP_SCALADOC=1 SKIP_RDOC=1 SKIP_SQLDOC=1 jekyll serve --watch
```

and

```bash
cd python/docs
make clean html
```

Closes #29548 from HyukjinKwon/SPARK-32183.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
HyukjinKwon committed Aug 28, 2020
1 parent d6c095c commit c154629
Showing 7 changed files with 426 additions and 369 deletions.
349 changes: 1 addition & 348 deletions docs/sql-pyspark-pandas-with-arrow.md

Large diffs are not rendered by default.

22 changes: 4 additions & 18 deletions examples/src/main/python/sql/arrow.py
@@ -21,6 +21,9 @@
./bin/spark-submit examples/src/main/python/sql/arrow.py
"""

# NOTE that this file is imported in user guide in PySpark documentation.
# The codes are referred via line numbers. See also `literalinclude` directive in Sphinx.

from pyspark.sql import SparkSession
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version

@@ -29,7 +32,6 @@


def dataframe_with_arrow_example(spark):
# $example on:dataframe_with_arrow$
import numpy as np
import pandas as pd

@@ -44,12 +46,11 @@ def dataframe_with_arrow_example(spark):

# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()
# $example off:dataframe_with_arrow$

print("Pandas DataFrame result statistics:\n%s\n" % str(result_pdf.describe()))


def ser_to_frame_pandas_udf_example(spark):
# $example on:ser_to_frame_pandas_udf$
import pandas as pd

from pyspark.sql.functions import pandas_udf
@@ -75,11 +76,9 @@ def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
# |-- func(long_col, string_col, struct_col): struct (nullable = true)
# | |-- col1: string (nullable = true)
# | |-- col2: long (nullable = true)
# $example off:ser_to_frame_pandas_udf$$


def ser_to_ser_pandas_udf_example(spark):
# $example on:ser_to_ser_pandas_udf$
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
@@ -111,11 +110,9 @@ def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
# | 4|
# | 9|
# +-------------------+
# $example off:ser_to_ser_pandas_udf$


def iter_ser_to_iter_ser_pandas_udf_example(spark):
# $example on:iter_ser_to_iter_ser_pandas_udf$
from typing import Iterator

import pandas as pd
@@ -139,11 +136,9 @@ def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
# | 3|
# | 4|
# +-----------+
# $example off:iter_ser_to_iter_ser_pandas_udf$


def iter_sers_to_iter_ser_pandas_udf_example(spark):
# $example on:iter_sers_to_iter_ser_pandas_udf$
from typing import Iterator, Tuple

import pandas as pd
@@ -168,11 +163,9 @@ def multiply_two_cols(
# | 4|
# | 9|
# +-----------------------+
# $example off:iter_sers_to_iter_ser_pandas_udf$


def ser_to_scalar_pandas_udf_example(spark):
# $example on:ser_to_scalar_pandas_udf$
import pandas as pd

from pyspark.sql.functions import pandas_udf
@@ -215,11 +208,9 @@ def mean_udf(v: pd.Series) -> float:
# | 2| 5.0| 6.0|
# | 2|10.0| 6.0|
# +---+----+------+
# $example off:ser_to_scalar_pandas_udf$


def grouped_apply_in_pandas_example(spark):
# $example on:grouped_apply_in_pandas$
df = spark.createDataFrame(
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
("id", "v"))
@@ -239,11 +230,9 @@ def subtract_mean(pdf):
# | 2|-1.0|
# | 2| 4.0|
# +---+----+
# $example off:grouped_apply_in_pandas$


def map_in_pandas_example(spark):
# $example on:map_in_pandas$
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def filter_func(iterator):
@@ -256,11 +245,9 @@ def filter_func(iterator):
# +---+---+
# | 1| 21|
# +---+---+
# $example off:map_in_pandas$


def cogrouped_apply_in_pandas_example(spark):
# $example on:cogrouped_apply_in_pandas$
import pandas as pd

df1 = spark.createDataFrame(
@@ -284,7 +271,6 @@ def asof_join(l, r):
# |20000101| 2|2.0| y|
# |20000102| 2|4.0| y|
# +--------+---+---+---+
# $example off:cogrouped_apply_in_pandas$


if __name__ == "__main__":
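For orientation, the new comment at the top of arrow.py notes that the user guide now pulls these examples in by line number via Sphinx, which is why the `$example on/off$` markers are deleted above. Below is a minimal, self-contained sketch of the first example's Arrow-backed conversion; the SparkSession setup and the configuration key follow the Spark Arrow guide rather than the truncated hunks above, and pandas, NumPy, and PyArrow are assumed to be installed.

```python
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow_pandas_sketch").getOrCreate()

# Enable Arrow-based columnar data transfers; this is disabled by default.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Generate a pandas DataFrame and convert it to a Spark DataFrame.
pdf = pd.DataFrame(np.random.rand(100, 3))
df = spark.createDataFrame(pdf)

# Convert the Spark DataFrame back to a pandas DataFrame using Arrow.
result_pdf = df.select("*").toPandas()
print(result_pdf.describe())

spark.stop()
```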
2 changes: 2 additions & 0 deletions python/docs/source/reference/pyspark.sql.rst
@@ -33,6 +33,7 @@ Core Classes
Column
Row
GroupedData
PandasCogroupedOps
DataFrameNaFunctions
DataFrameStatFunctions
Window
@@ -539,4 +540,5 @@ Grouping
GroupedData.min
GroupedData.pivot
GroupedData.sum
PandasCogroupedOps.applyInPandas
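
The two additions above register `PandasCogroupedOps` and `PandasCogroupedOps.applyInPandas` in the API reference. As a quick orientation, here is a minimal usage sketch modeled on the cogrouped example in arrow.py; the SparkSession setup is an assumption, and pandas plus PyArrow must be installed.

```python
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cogrouped_pandas_sketch").getOrCreate()

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ("time", "id", "v1"))
df2 = spark.createDataFrame(
    [(20000101, 1, "x"), (20000101, 2, "y")],
    ("time", "id", "v2"))


def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Each cogroup is handed to the UDF as a pair of pandas DataFrames.
    return pd.merge_asof(left, right, on="time", by="id")


# GroupedData.cogroup returns a PandasCogroupedOps; applyInPandas runs the UDF per cogroup.
df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
    asof_join, schema="time int, id int, v1 double, v2 string").show()

spark.stop()
```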
