[SPARK-32183][DOCS][PYTHON] User Guide - PySpark Usage Guide for Pandas with Apache Arrow

### What changes were proposed in this pull request?

This PR proposes to move Arrow usage guide from Spark documentation site to PySpark documentation site (at "User Guide").

Here is a demo for quicker review: https://hyukjin-spark.readthedocs.io/en/stable/user_guide/arrow_pandas.html

### Why are the changes needed?

To give PySpark users a single place to look, and to improve the documentation.

### Does this PR introduce _any_ user-facing change?

Yes, it will move https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html to our PySpark documentation.

### How was this patch tested?

```bash
cd docs
SKIP_SCALADOC=1 SKIP_RDOC=1 SKIP_SQLDOC=1 jekyll serve --watch
```

and

```bash
cd python/docs
make clean html
```

Closes #29548 from HyukjinKwon/SPARK-32183.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
HyukjinKwon committed Aug 28, 2020
1 parent d6c095c commit c154629
Showing 7 changed files with 426 additions and 369 deletions.
349 changes: 1 addition & 348 deletions docs/sql-pyspark-pandas-with-arrow.md

Large diffs are not rendered by default.

22 changes: 4 additions & 18 deletions examples/src/main/python/sql/arrow.py
@@ -21,6 +21,9 @@
./bin/spark-submit examples/src/main/python/sql/arrow.py
"""

# NOTE that this file is imported in user guide in PySpark documentation.
# The codes are referred via line numbers. See also `literalinclude` directive in Sphinx.

from pyspark.sql import SparkSession
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version

@@ -29,7 +32,6 @@


def dataframe_with_arrow_example(spark):
# $example on:dataframe_with_arrow$
import numpy as np
import pandas as pd

@@ -44,12 +46,11 @@ def dataframe_with_arrow_example(spark):

# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()
# $example off:dataframe_with_arrow$

print("Pandas DataFrame result statistics:\n%s\n" % str(result_pdf.describe()))


def ser_to_frame_pandas_udf_example(spark):
# $example on:ser_to_frame_pandas_udf$
import pandas as pd

from pyspark.sql.functions import pandas_udf
@@ -75,11 +76,9 @@ def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
# |-- func(long_col, string_col, struct_col): struct (nullable = true)
# | |-- col1: string (nullable = true)
# | |-- col2: long (nullable = true)
# $example off:ser_to_frame_pandas_udf$$


def ser_to_ser_pandas_udf_example(spark):
# $example on:ser_to_ser_pandas_udf$
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
@@ -111,11 +110,9 @@ def multiply_func(a: pd.Series, b: pd.Series) -> pd.Series:
# | 4|
# | 9|
# +-------------------+
# $example off:ser_to_ser_pandas_udf$


def iter_ser_to_iter_ser_pandas_udf_example(spark):
# $example on:iter_ser_to_iter_ser_pandas_udf$
from typing import Iterator

import pandas as pd
@@ -139,11 +136,9 @@ def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
# | 3|
# | 4|
# +-----------+
# $example off:iter_ser_to_iter_ser_pandas_udf$


def iter_sers_to_iter_ser_pandas_udf_example(spark):
# $example on:iter_sers_to_iter_ser_pandas_udf$
from typing import Iterator, Tuple

import pandas as pd
@@ -168,11 +163,9 @@ def multiply_two_cols(
# | 4|
# | 9|
# +-----------------------+
# $example off:iter_sers_to_iter_ser_pandas_udf$


def ser_to_scalar_pandas_udf_example(spark):
# $example on:ser_to_scalar_pandas_udf$
import pandas as pd

from pyspark.sql.functions import pandas_udf
@@ -215,11 +208,9 @@ def mean_udf(v: pd.Series) -> float:
# | 2| 5.0| 6.0|
# | 2|10.0| 6.0|
# +---+----+------+
# $example off:ser_to_scalar_pandas_udf$


def grouped_apply_in_pandas_example(spark):
# $example on:grouped_apply_in_pandas$
df = spark.createDataFrame(
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
("id", "v"))
@@ -239,11 +230,9 @@ def subtract_mean(pdf):
# | 2|-1.0|
# | 2| 4.0|
# +---+----+
# $example off:grouped_apply_in_pandas$


def map_in_pandas_example(spark):
# $example on:map_in_pandas$
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def filter_func(iterator):
@@ -256,11 +245,9 @@ def filter_func(iterator):
# +---+---+
# | 1| 21|
# +---+---+
# $example off:map_in_pandas$


def cogrouped_apply_in_pandas_example(spark):
# $example on:cogrouped_apply_in_pandas$
import pandas as pd

df1 = spark.createDataFrame(
@@ -284,7 +271,6 @@ def asof_join(l, r):
# |20000101| 2|2.0| y|
# |20000102| 2|4.0| y|
# +--------+---+---+---+
# $example off:cogrouped_apply_in_pandas$


if __name__ == "__main__":
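For orientation, the new comment at the top of arrow.py notes that the user guide now pulls these examples in by line number via Sphinx, which is why the `$example on/off$` markers are deleted above. Below is a minimal, self-contained sketch of the first example's Arrow-backed conversion; the SparkSession setup and the configuration key follow the Spark Arrow guide rather than the truncated hunks above, and pandas, NumPy, and PyArrow are assumed to be installed.

```python
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow_pandas_sketch").getOrCreate()

# Enable Arrow-based columnar data transfers; this is disabled by default.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Generate a pandas DataFrame and convert it to a Spark DataFrame.
pdf = pd.DataFrame(np.random.rand(100, 3))
df = spark.createDataFrame(pdf)

# Convert the Spark DataFrame back to a pandas DataFrame using Arrow.
result_pdf = df.select("*").toPandas()
print(result_pdf.describe())

spark.stop()
```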
2 changes: 2 additions & 0 deletions python/docs/source/reference/pyspark.sql.rst
@@ -33,6 +33,7 @@ Core Classes
Column
Row
GroupedData
PandasCogroupedOps
DataFrameNaFunctions
DataFrameStatFunctions
Window
@@ -539,4 +540,5 @@ Grouping
GroupedData.min
GroupedData.pivot
GroupedData.sum
PandasCogroupedOps.applyInPandas
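
The two additions above register `PandasCogroupedOps` and `PandasCogroupedOps.applyInPandas` in the API reference. As a quick orientation, here is a minimal usage sketch modeled on the cogrouped example in arrow.py; the SparkSession setup is an assumption, and pandas plus PyArrow must be installed.

```python
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cogrouped_pandas_sketch").getOrCreate()

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ("time", "id", "v1"))
df2 = spark.createDataFrame(
    [(20000101, 1, "x"), (20000101, 2, "y")],
    ("time", "id", "v2"))


def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Each cogroup is handed to the UDF as a pair of pandas DataFrames.
    return pd.merge_asof(left, right, on="time", by="id")


# GroupedData.cogroup returns a PandasCogroupedOps; applyInPandas runs the UDF per cogroup.
df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
    asof_join, schema="time int, id int, v1 double, v2 string").show()

spark.stop()
```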
