From 8ae0dfeb3743d4f3d74fc6aa34b92564ea632d24 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Mon, 20 Nov 2023 11:08:44 +0800 Subject: [PATCH 01/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- python/docs/source/user_guide/10min.rst | 188 ++++++++++++++++++++++++ python/docs/source/user_guide/index.rst | 1 + 2 files changed, 189 insertions(+) create mode 100644 python/docs/source/user_guide/10min.rst diff --git a/python/docs/source/user_guide/10min.rst b/python/docs/source/user_guide/10min.rst new file mode 100644 index 0000000000000..0feda9c118c1c --- /dev/null +++ b/python/docs/source/user_guide/10min.rst @@ -0,0 +1,188 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +===================== +10 minutes to PySpark +===================== + +.. currentmodule:: pyspark.sql + +This is a short introduction to pyspark, geared mainly for new users. +You can see more complex recipes in the :ref:`/reference/index.rst`. + +Basic data structures +--------------------- + +Pyspark provides an important class for handling data: + +1. :class:`DataFrame`: a distributed collection of data grouped into named columns. + +SparkSession creation +--------------------- +PySpark applications start with initializing :class:`SparkSession` which is the entry point of PySpark as below. +In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session +in the variable spark for users. + +.. code-block:: python + + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + +DataFrame creation +------------------ +A PySpark :class:`DataFrame` can be created via :meth:`SparkSession.createDataFrame` typically by passing +a list of lists, tuples, dictionaries and :class:`Row`, a pandas :class:`pandas.DataFrame` +and an :class:`pyspark.RDD` consisting of such a list. +:meth:`SparkSession.createDataFrame` takes the `schema` argument to specify the schema of the DataFrame. +When it is omitted, PySpark infers the corresponding schema by taking a sample from the data. + +Creating a PySpark :class:`DataFrame` from a list of lists +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) + df + +DataFrame[_1: string, _2: bigint] + + +Creating a PySpark :class:`DataFrame` from a list of tuples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.createDataFrame([('Alice', 1), ('Bob', 5)]) + df + +DataFrame[_1: string, _2: bigint] + + +Creating a PySpark :class:`DataFrame` from a list of dictionaries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + df = spark.createDataFrame([{'name': 'Alice', 'age': 1}]) + df + +DataFrame[age: bigint, name: string] + + +Creating a PySpark :class:`DataFrame` from a list of :class:`Row` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from pyspark.sql import Row + Person = Row('name', 'age') + df = spark.createDataFrame([Person("Alice", 1), Person("Bob", 5)]) + df + +DataFrame[name: string, age: bigint] + + +Creating a PySpark :class:`DataFrame` from a :class:`pandas.DataFrame` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import pandas as pd + df = spark.createDataFrame(pd.DataFrame([[1, 2]])) + df + +DataFrame[0: bigint, 1: bigint] + + +Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import numpy as np + import pandas as pd + df = spark.createDataFrame(pd.DataFrame(data=np.array([[1, 2], [3, 4]]), columns=['a', 'b'])) + df + +DataFrame[a: bigint, b: bigint] + + +Creating a PySpark :class:`DataFrame` from an :class:`pyspark.RDD` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from pyspark.sql import Row + rdd = spark.sparkContext.parallelize([Row(name = "Alice", age = 2), Row(name = "Bob", age = 5)]) + df = spark.createDataFrame(rdd) + df + +DataFrame[name: string, age: bigint] + + +Creating a PySpark :class:`DataFrame` by reading existing **json** format file data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.read.format("json").load("python/test_support/sql/people.json") + df + +DataFrame[age: bigint, name: string] + + +Creating a PySpark :class:`DataFrame` by reading existing **csv** format file data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.read.format("csv").load("python/test_support/sql/people.csv") + df + +DataFrame[age: bigint, name: string] + + +Creating a PySpark :class:`DataFrame` by reading existing **parquet** format file data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.read.format("parquet").load("python/test_support/sql/people.parquet") + df + +DataFrame[age: bigint, name: string] + +Creating a PySpark :class:`DataFrame` by reading existing **orc** format file data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + df = spark.read.format("parquet").load("python/test_support/sql/people.orc") + df + +DataFrame[age: bigint, name: string] + + +Creating a PySpark :class:`DataFrame` by reading data from other databases using **JDBC** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + df = spark.read.format("jdbc").options(url=url, dbtable=dbtable).load() + df + +DataFrame[age: bigint, name: string] diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst index 67f8c8d4d0fe3..1f8437010e721 100644 --- a/python/docs/source/user_guide/index.rst +++ b/python/docs/source/user_guide/index.rst @@ -26,6 +26,7 @@ PySpark specific user guides are available here: :maxdepth: 2 python_packaging + 10min sql/index pandas_on_spark/index From 99d2ca3dcf4f65d895ff10ecd94b4f8eca29b328 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Mon, 20 Nov 2023 11:38:28 +0800 Subject: [PATCH 02/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- .../{10min.rst => dataframe_creation.rst} | 27 +++++++------------ python/docs/source/user_guide/index.rst | 2 +- 2 files changed, 10 insertions(+), 19 deletions(-) rename python/docs/source/user_guide/{10min.rst => dataframe_creation.rst} (89%) diff --git a/python/docs/source/user_guide/10min.rst b/python/docs/source/user_guide/dataframe_creation.rst similarity index 89% rename from python/docs/source/user_guide/10min.rst rename to python/docs/source/user_guide/dataframe_creation.rst index 0feda9c118c1c..d57f3e5b981ac 100644 --- a/python/docs/source/user_guide/10min.rst +++ b/python/docs/source/user_guide/dataframe_creation.rst @@ -15,15 +15,12 @@ specific language governing permissions and limitations under the License. -===================== -10 minutes to PySpark -===================== +================== +DataFrame creation +================== .. currentmodule:: pyspark.sql -This is a short introduction to pyspark, geared mainly for new users. -You can see more complex recipes in the :ref:`/reference/index.rst`. - Basic data structures --------------------- @@ -31,19 +28,9 @@ Pyspark provides an important class for handling data: 1. :class:`DataFrame`: a distributed collection of data grouped into named columns. -SparkSession creation ---------------------- -PySpark applications start with initializing :class:`SparkSession` which is the entry point of PySpark as below. -In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session -in the variable spark for users. - -.. code-block:: python +Creating through `createDataFrame` +---------------------------------- - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - -DataFrame creation ------------------- A PySpark :class:`DataFrame` can be created via :meth:`SparkSession.createDataFrame` typically by passing a list of lists, tuples, dictionaries and :class:`Row`, a pandas :class:`pandas.DataFrame` and an :class:`pyspark.RDD` consisting of such a list. 
@@ -134,6 +121,9 @@ Creating a PySpark :class:`DataFrame` from an :class:`pyspark.RDD` DataFrame[name: string, age: bigint] +Creating through `read.format(...).load(...)` +--------------------------------------------- + Creating a PySpark :class:`DataFrame` by reading existing **json** format file data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -186,3 +176,4 @@ Creating a PySpark :class:`DataFrame` by reading data from other databases using df DataFrame[age: bigint, name: string] + diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst index 1f8437010e721..86f1a37b7f557 100644 --- a/python/docs/source/user_guide/index.rst +++ b/python/docs/source/user_guide/index.rst @@ -26,7 +26,7 @@ PySpark specific user guides are available here: :maxdepth: 2 python_packaging - 10min + dataframe_creation sql/index pandas_on_spark/index From 74b86c2900959f7270b9587bc4ce651293173eee Mon Sep 17 00:00:00 2001 From: panbingkun Date: Tue, 21 Nov 2023 18:56:36 +0800 Subject: [PATCH 03/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- .../source/user_guide/dataframe_creation.rst | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/python/docs/source/user_guide/dataframe_creation.rst b/python/docs/source/user_guide/dataframe_creation.rst index d57f3e5b981ac..2a2b3b24a9890 100644 --- a/python/docs/source/user_guide/dataframe_creation.rst +++ b/python/docs/source/user_guide/dataframe_creation.rst @@ -16,25 +16,25 @@ under the License. ================== -DataFrame creation +DataFrame Creation ================== .. currentmodule:: pyspark.sql -Basic data structures +Basic Data Structures --------------------- -Pyspark provides an important class for handling data: +PySpark provides an important class for handling data: 1. :class:`DataFrame`: a distributed collection of data grouped into named columns. -Creating through `createDataFrame` +Creating Through `createDataFrame` ---------------------------------- A PySpark :class:`DataFrame` can be created via :meth:`SparkSession.createDataFrame` typically by passing -a list of lists, tuples, dictionaries and :class:`Row`, a pandas :class:`pandas.DataFrame` -and an :class:`pyspark.RDD` consisting of such a list. -:meth:`SparkSession.createDataFrame` takes the `schema` argument to specify the schema of the DataFrame. +a list of lists, tuples, dictionaries and :class:`Row`, a pandas :class:`pandas.DataFrame`, +a NumPy :class:`numpy.ndarray` and an :class:`pyspark.RDD`. +:meth:`SparkSession.createDataFrame` takes the `schema` argument to specify the schema of the :class:`DataFrame`. When it is omitted, PySpark infers the corresponding schema by taking a sample from the data. Creating a PySpark :class:`DataFrame` from a list of lists @@ -59,6 +59,19 @@ Creating a PySpark :class:`DataFrame` from a list of tuples DataFrame[_1: string, _2: bigint] +Creating a PySpark :class:`DataFrame` with the explicit schema specified +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + from pyspark.sql.types import * + schema = StructType([StructField("name", StringType(), True), StructField("age", IntegerType(), True)]) + df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) + df + +DataFrame[name: string, age: int] + + Creating a PySpark :class:`DataFrame` from a list of dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -108,20 +121,7 @@ Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` DataFrame[a: bigint, b: bigint] -Creating a PySpark :class:`DataFrame` from an :class:`pyspark.RDD` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - from pyspark.sql import Row - rdd = spark.sparkContext.parallelize([Row(name = "Alice", age = 2), Row(name = "Bob", age = 5)]) - df = spark.createDataFrame(rdd) - df - -DataFrame[name: string, age: bigint] - - -Creating through `read.format(...).load(...)` +Creating Through `read.format(...).load(...)` --------------------------------------------- Creating a PySpark :class:`DataFrame` by reading existing **json** format file data From 62ad78ddd2901784425a534f9cfdeb8c94554d77 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 23 Nov 2023 16:29:45 +0800 Subject: [PATCH 04/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- .../source/user_guide/dataframe_creation.rst | 164 ++++++++++++------ 1 file changed, 109 insertions(+), 55 deletions(-) diff --git a/python/docs/source/user_guide/dataframe_creation.rst b/python/docs/source/user_guide/dataframe_creation.rst index 2a2b3b24a9890..04c08e65aae59 100644 --- a/python/docs/source/user_guide/dataframe_creation.rst +++ b/python/docs/source/user_guide/dataframe_creation.rst @@ -42,10 +42,14 @@ Creating a PySpark :class:`DataFrame` from a list of lists .. code-block:: python - df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) - df - -DataFrame[_1: string, _2: bigint] + >>> df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) + >>> df.show() + +-----+---+ + | _1| _2| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ Creating a PySpark :class:`DataFrame` from a list of tuples @@ -53,10 +57,14 @@ Creating a PySpark :class:`DataFrame` from a list of tuples .. code-block:: python - df = spark.createDataFrame([('Alice', 1), ('Bob', 5)]) - df - -DataFrame[_1: string, _2: bigint] + >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)]) + >>> df.show() + +-----+---+ + | _1| _2| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ Creating a PySpark :class:`DataFrame` with the explicit schema specified @@ -64,12 +72,17 @@ Creating a PySpark :class:`DataFrame` with the explicit schema specified .. code-block:: python - from pyspark.sql.types import * - schema = StructType([StructField("name", StringType(), True), StructField("age", IntegerType(), True)]) - df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) - df - -DataFrame[name: string, age: int] + >>> from pyspark.sql.types import * + >>> schema = StructType([StructField("name", StringType(), True), + ... StructField("age", IntegerType(), True)]) + >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ Creating a PySpark :class:`DataFrame` from a list of dictionaries @@ -77,10 +90,13 @@ Creating a PySpark :class:`DataFrame` from a list of dictionaries .. 
code-block:: python - df = spark.createDataFrame([{'name': 'Alice', 'age': 1}]) - df - -DataFrame[age: bigint, name: string] + >>> df = spark.createDataFrame([{'name': 'Alice', 'age': 1}]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 1|Alice| + +---+-----+ Creating a PySpark :class:`DataFrame` from a list of :class:`Row` @@ -88,12 +104,16 @@ Creating a PySpark :class:`DataFrame` from a list of :class:`Row` .. code-block:: python - from pyspark.sql import Row - Person = Row('name', 'age') - df = spark.createDataFrame([Person("Alice", 1), Person("Bob", 5)]) - df - -DataFrame[name: string, age: bigint] + >>> from pyspark.sql import Row + >>> Person = Row('name', 'age') + >>> df = spark.createDataFrame([Person("Alice", 1), Person("Bob", 5)]) + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ Creating a PySpark :class:`DataFrame` from a :class:`pandas.DataFrame` @@ -101,11 +121,14 @@ Creating a PySpark :class:`DataFrame` from a :class:`pandas.DataFrame` .. code-block:: python - import pandas as pd - df = spark.createDataFrame(pd.DataFrame([[1, 2]])) - df - -DataFrame[0: bigint, 1: bigint] + >>> import pandas as pd + >>> df = spark.createDataFrame(pd.DataFrame([[1, 2]])) + >>> df.show() + +---+---+ + | 0| 1| + +---+---+ + | 1| 2| + +---+---+ Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` @@ -113,12 +136,17 @@ Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` .. code-block:: python - import numpy as np - import pandas as pd - df = spark.createDataFrame(pd.DataFrame(data=np.array([[1, 2], [3, 4]]), columns=['a', 'b'])) - df - -DataFrame[a: bigint, b: bigint] + >>> import numpy as np + >>> import pandas as pd + >>> df = spark.createDataFrame(pd.DataFrame(data=np.array([[1, 2], [3, 4]]), + ... columns=['a', 'b'])) + >>> df.show() + +---+---+ + | a| b| + +---+---+ + | 1| 2| + | 3| 4| + +---+---+ Creating Through `read.format(...).load(...)` @@ -129,10 +157,15 @@ Creating a PySpark :class:`DataFrame` by reading existing **json** format file d .. code-block:: python - df = spark.read.format("json").load("python/test_support/sql/people.json") - df - -DataFrame[age: bigint, name: string] + >>> df = spark.read.format("json").load("python/test_support/sql/people.json") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ Creating a PySpark :class:`DataFrame` by reading existing **csv** format file data @@ -140,10 +173,16 @@ Creating a PySpark :class:`DataFrame` by reading existing **csv** format file da .. code-block:: python - df = spark.read.format("csv").load("python/test_support/sql/people.csv") - df - -DataFrame[age: bigint, name: string] + >>> df = spark.read.format("csv").option("header", "true"). + ... load("python/test_support/sql/people.csv") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ Creating a PySpark :class:`DataFrame` by reading existing **parquet** format file data @@ -151,20 +190,30 @@ Creating a PySpark :class:`DataFrame` by reading existing **parquet** format fil .. 
code-block:: python - df = spark.read.format("parquet").load("python/test_support/sql/people.parquet") - df - -DataFrame[age: bigint, name: string] + >>> df = spark.read.format("parquet").load("python/test_support/sql/people.parquet") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ Creating a PySpark :class:`DataFrame` by reading existing **orc** format file data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python - df = spark.read.format("parquet").load("python/test_support/sql/people.orc") - df - -DataFrame[age: bigint, name: string] + >>> df = spark.read.format("parquet").load("python/test_support/sql/people.orc") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ Creating a PySpark :class:`DataFrame` by reading data from other databases using **JDBC** @@ -172,8 +221,13 @@ Creating a PySpark :class:`DataFrame` by reading data from other databases using .. code-block:: python - df = spark.read.format("jdbc").options(url=url, dbtable=dbtable).load() - df - -DataFrame[age: bigint, name: string] + >>> df = spark.read.format("jdbc").options(url=url, dbtable=dbtable).load() + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ From 6f3cbbb8e3e3a670c96d60a7190c2462270c2c3e Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 24 Nov 2023 11:22:13 +0800 Subject: [PATCH 05/11] Apply suggestions from code review Co-authored-by: Hyukjin Kwon --- python/docs/source/user_guide/dataframe_creation.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/docs/source/user_guide/dataframe_creation.rst b/python/docs/source/user_guide/dataframe_creation.rst index 04c08e65aae59..be60f74d267f8 100644 --- a/python/docs/source/user_guide/dataframe_creation.rst +++ b/python/docs/source/user_guide/dataframe_creation.rst @@ -149,7 +149,7 @@ Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` +---+---+ -Creating Through `read.format(...).load(...)` +Creating through `read.format(...).load(...)` --------------------------------------------- Creating a PySpark :class:`DataFrame` by reading existing **json** format file data @@ -173,8 +173,8 @@ Creating a PySpark :class:`DataFrame` by reading existing **csv** format file da .. code-block:: python - >>> df = spark.read.format("csv").option("header", "true"). - ... load("python/test_support/sql/people.csv") + >>> df = spark.read.format("csv").option("header", "true").load( + ... 
"python/test_support/sql/people.csv") >>> df.show() +----+-------+ | age| name| From 84df90c4e070b5f7dda687d7ebb060242601b11b Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 24 Nov 2023 15:57:11 +0800 Subject: [PATCH 06/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- python/docs/source/user_guide/index.rst | 1 - .../{ => sql}/dataframe_creation.rst | 70 +++++++++++-------- python/docs/source/user_guide/sql/index.rst | 7 +- 3 files changed, 43 insertions(+), 35 deletions(-) rename python/docs/source/user_guide/{ => sql}/dataframe_creation.rst (80%) diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst index 86f1a37b7f557..67f8c8d4d0fe3 100644 --- a/python/docs/source/user_guide/index.rst +++ b/python/docs/source/user_guide/index.rst @@ -26,7 +26,6 @@ PySpark specific user guides are available here: :maxdepth: 2 python_packaging - dataframe_creation sql/index pandas_on_spark/index diff --git a/python/docs/source/user_guide/dataframe_creation.rst b/python/docs/source/user_guide/sql/dataframe_creation.rst similarity index 80% rename from python/docs/source/user_guide/dataframe_creation.rst rename to python/docs/source/user_guide/sql/dataframe_creation.rst index be60f74d267f8..bc3476c68c342 100644 --- a/python/docs/source/user_guide/dataframe_creation.rst +++ b/python/docs/source/user_guide/sql/dataframe_creation.rst @@ -21,14 +21,7 @@ DataFrame Creation .. currentmodule:: pyspark.sql -Basic Data Structures ---------------------- - -PySpark provides an important class for handling data: - -1. :class:`DataFrame`: a distributed collection of data grouped into named columns. - -Creating Through `createDataFrame` +Creating through `createDataFrame` ---------------------------------- A PySpark :class:`DataFrame` can be created via :meth:`SparkSession.createDataFrame` typically by passing @@ -85,6 +78,21 @@ Creating a PySpark :class:`DataFrame` with the explicit schema specified +-----+---+ +Creating a PySpark :class:`DataFrame` with the explicit DDL-formatted string schema specified +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema = "name string, age int") + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ + + Creating a PySpark :class:`DataFrame` from a list of dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -190,14 +198,22 @@ Creating a PySpark :class:`DataFrame` by reading existing **parquet** format fil .. code-block:: python - >>> df = spark.read.format("parquet").load("python/test_support/sql/people.parquet") - >>> df.show() + >>> # Write a Parquet file to the temporary directory, and read it back + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as d: + ... # Overwrite the path with a new Parquet file + ... spark.createDataFrame( + ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] + ... ).write.mode("overwrite").format("parquet").save(d) + ... + ... # Read the Parquet file as a DataFrame + ... df = spark.read.format("parquet").load(d) + ... df.show() +----+-------+ | age| name| +----+-------+ - |NULL|Michael| | 30| Andy| - | 19| Justin| + |NULL|Michael| +----+-------+ Creating a PySpark :class:`DataFrame` by reading existing **orc** format file data @@ -205,29 +221,21 @@ Creating a PySpark :class:`DataFrame` by reading existing **orc** format file da .. 
code-block:: python - >>> df = spark.read.format("parquet").load("python/test_support/sql/people.orc") - >>> df.show() + >>> # Write a Orc file to the temporary directory, and read it back + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as d: + ... # Overwrite the path with a new Orc file + ... spark.createDataFrame( + ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] + ... ).write.mode("overwrite").format("orc").save(d) + ... + ... # Read the Orc file as a DataFrame + ... df = spark.read.format("orc").load(d) + ... df.show() +----+-------+ | age| name| +----+-------+ - |NULL|Michael| | 30| Andy| - | 19| Justin| - +----+-------+ - - -Creating a PySpark :class:`DataFrame` by reading data from other databases using **JDBC** -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> df = spark.read.format("jdbc").options(url=url, dbtable=dbtable).load() - >>> df.show() - +----+-------+ - | age| name| - +----+-------+ |NULL|Michael| - | 30| Andy| - | 19| Justin| +----+-------+ diff --git a/python/docs/source/user_guide/sql/index.rst b/python/docs/source/user_guide/sql/index.rst index 118cf139d9b38..e4da562503191 100644 --- a/python/docs/source/user_guide/sql/index.rst +++ b/python/docs/source/user_guide/sql/index.rst @@ -16,13 +16,14 @@ under the License. -========= -Spark SQL -========= +================= +DataFrame and SQL +================= .. toctree:: :maxdepth: 2 + dataframe_creation arrow_pandas python_udtf type_conversions From 98195bb7dfde952f829504c0699b8355650da127 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 24 Nov 2023 16:33:44 +0800 Subject: [PATCH 07/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- python/docs/source/user_guide/sql/dataframe_creation.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/docs/source/user_guide/sql/dataframe_creation.rst b/python/docs/source/user_guide/sql/dataframe_creation.rst index bc3476c68c342..28d794232604a 100644 --- a/python/docs/source/user_guide/sql/dataframe_creation.rst +++ b/python/docs/source/user_guide/sql/dataframe_creation.rst @@ -205,7 +205,6 @@ Creating a PySpark :class:`DataFrame` by reading existing **parquet** format fil ... spark.createDataFrame( ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] ... ).write.mode("overwrite").format("parquet").save(d) - ... ... # Read the Parquet file as a DataFrame ... df = spark.read.format("parquet").load(d) ... df.show() @@ -228,7 +227,6 @@ Creating a PySpark :class:`DataFrame` by reading existing **orc** format file da ... spark.createDataFrame( ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] ... ).write.mode("overwrite").format("orc").save(d) - ... ... # Read the Orc file as a DataFrame ... df = spark.read.format("orc").load(d) ... 
df.show() From 8b4d8d5767a2d138f512fccd75da6e5bedd55350 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 30 Nov 2023 09:24:47 +0800 Subject: [PATCH 08/11] Apply suggestions from code review Co-authored-by: allisonwang-db --- .../user_guide/sql/dataframe_creation.rst | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/python/docs/source/user_guide/sql/dataframe_creation.rst b/python/docs/source/user_guide/sql/dataframe_creation.rst index 28d794232604a..3ea7d4154020c 100644 --- a/python/docs/source/user_guide/sql/dataframe_creation.rst +++ b/python/docs/source/user_guide/sql/dataframe_creation.rst @@ -18,7 +18,7 @@ ================== DataFrame Creation ================== - +PySpark allows you to create DataFrames in several ways. Let's explore these methods with simple examples. .. currentmodule:: pyspark.sql Creating through `createDataFrame` @@ -30,7 +30,7 @@ a NumPy :class:`numpy.ndarray` and an :class:`pyspark.RDD`. :meth:`SparkSession.createDataFrame` takes the `schema` argument to specify the schema of the :class:`DataFrame`. When it is omitted, PySpark infers the corresponding schema by taking a sample from the data. -Creating a PySpark :class:`DataFrame` from a list of lists +Creating a :class:`DataFrame` from Lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -60,14 +60,16 @@ Creating a PySpark :class:`DataFrame` from a list of tuples +-----+---+ -Creating a PySpark :class:`DataFrame` with the explicit schema specified +Creating a :class:`DataFrame` with a Specified Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +Define a schema and use it to create a DataFrame. A schema describes the column names and types. .. code-block:: python >>> from pyspark.sql.types import * - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) +schema = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True) +]) >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) >>> df.show() +-----+---+ @@ -93,8 +95,9 @@ Creating a PySpark :class:`DataFrame` with the explicit DDL-formatted string sch +-----+---+ -Creating a PySpark :class:`DataFrame` from a list of dictionaries +Creating a :class:`DataFrame` from Dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Dictionaries with keys as column names can also be used. .. code-block:: python @@ -107,8 +110,9 @@ Creating a PySpark :class:`DataFrame` from a list of dictionaries +---+-----+ -Creating a PySpark :class:`DataFrame` from a list of :class:`Row` +Creating a :class:`DataFrame` from :class:`Row`s ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use the Row type to define rows of a DataFrame. .. code-block:: python @@ -124,7 +128,7 @@ Creating a PySpark :class:`DataFrame` from a list of :class:`Row` +-----+---+ -Creating a PySpark :class:`DataFrame` from a :class:`pandas.DataFrame` +Creating a :class:`DataFrame` from a :class:`pandas.DataFrame` or a :class:`numpy.ndarray` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
code-block:: python From 1ffcf88de442c7f39f259c8cf1ff7a180c79ddc1 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 30 Nov 2023 10:16:02 +0800 Subject: [PATCH 09/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- ...e_creation.rst => creating_dataframes.rst} | 85 +++++++------------ python/docs/source/user_guide/sql/index.rst | 8 +- 2 files changed, 36 insertions(+), 57 deletions(-) rename python/docs/source/user_guide/sql/{dataframe_creation.rst => creating_dataframes.rst} (65%) diff --git a/python/docs/source/user_guide/sql/dataframe_creation.rst b/python/docs/source/user_guide/sql/creating_dataframes.rst similarity index 65% rename from python/docs/source/user_guide/sql/dataframe_creation.rst rename to python/docs/source/user_guide/sql/creating_dataframes.rst index 3ea7d4154020c..021b58c325a7d 100644 --- a/python/docs/source/user_guide/sql/dataframe_creation.rst +++ b/python/docs/source/user_guide/sql/creating_dataframes.rst @@ -15,25 +15,20 @@ specific language governing permissions and limitations under the License. -================== -DataFrame Creation -================== -PySpark allows you to create DataFrames in several ways. Let's explore these methods with simple examples. -.. currentmodule:: pyspark.sql -Creating through `createDataFrame` ----------------------------------- +=================== +Creating DataFrames +=================== + +PySpark allows you to create :class:`DataFrame`s in several ways. Let's explore these methods with simple examples. -A PySpark :class:`DataFrame` can be created via :meth:`SparkSession.createDataFrame` typically by passing -a list of lists, tuples, dictionaries and :class:`Row`, a pandas :class:`pandas.DataFrame`, -a NumPy :class:`numpy.ndarray` and an :class:`pyspark.RDD`. -:meth:`SparkSession.createDataFrame` takes the `schema` argument to specify the schema of the :class:`DataFrame`. -When it is omitted, PySpark infers the corresponding schema by taking a sample from the data. +.. currentmodule:: pyspark.sql Creating a :class:`DataFrame` from Lists -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python + :emphasize-lines: 4 >>> df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) >>> df.show() @@ -45,31 +40,18 @@ Creating a :class:`DataFrame` from Lists +-----+---+ -Creating a PySpark :class:`DataFrame` from a list of tuples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)]) - >>> df.show() - +-----+---+ - | _1| _2| - +-----+---+ - |Alice| 1| - | Bob| 5| - +-----+---+ - - Creating a :class:`DataFrame` with a Specified Schema -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Define a schema and use it to create a DataFrame. A schema describes the column names and types. + .. code-block:: python - >>> from pyspark.sql.types import * -schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True) -]) + >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType + >>> schema = StructType([ + ... StructField("name", StringType(), True), + ... StructField("age", IntegerType(), True) + ... 
]) >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) >>> df.show() +-----+---+ @@ -79,9 +61,7 @@ schema = StructType([ | Bob| 5| +-----+---+ - -Creating a PySpark :class:`DataFrame` with the explicit DDL-formatted string schema specified -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use DDL-formatted string schema to create a DataFrame. .. code-block:: python @@ -96,7 +76,8 @@ Creating a PySpark :class:`DataFrame` with the explicit DDL-formatted string sch Creating a :class:`DataFrame` from Dictionaries -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Dictionaries with keys as column names can also be used. .. code-block:: python @@ -111,7 +92,8 @@ Dictionaries with keys as column names can also be used. Creating a :class:`DataFrame` from :class:`Row`s -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Use the Row type to define rows of a DataFrame. .. code-block:: python @@ -129,7 +111,7 @@ Use the Row type to define rows of a DataFrame. Creating a :class:`DataFrame` from a :class:`pandas.DataFrame` or a :class:`numpy.ndarray` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -143,9 +125,6 @@ Creating a :class:`DataFrame` from a :class:`pandas.DataFrame` or a :class:`nump +---+---+ -Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. code-block:: python >>> import numpy as np @@ -161,11 +140,11 @@ Creating a PySpark :class:`DataFrame` from a :class:`numpy.ndarray` +---+---+ -Creating through `read.format(...).load(...)` ---------------------------------------------- +Reading Data from Files +----------------------- -Creating a PySpark :class:`DataFrame` by reading existing **json** format file data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Example with **JSON** +~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -180,8 +159,8 @@ Creating a PySpark :class:`DataFrame` by reading existing **json** format file d +----+-------+ -Creating a PySpark :class:`DataFrame` by reading existing **csv** format file data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Example with **CSV** +~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -197,8 +176,8 @@ Creating a PySpark :class:`DataFrame` by reading existing **csv** format file da +----+-------+ -Creating a PySpark :class:`DataFrame` by reading existing **parquet** format file data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Example with **PARQUET** +~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -219,8 +198,8 @@ Creating a PySpark :class:`DataFrame` by reading existing **parquet** format fil |NULL|Michael| +----+-------+ -Creating a PySpark :class:`DataFrame` by reading existing **orc** format file data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Example with **ORC** +~~~~~~~~~~~~~~~~~~~~ .. 
code-block:: python diff --git a/python/docs/source/user_guide/sql/index.rst b/python/docs/source/user_guide/sql/index.rst index e4da562503191..1d3c9057f99e5 100644 --- a/python/docs/source/user_guide/sql/index.rst +++ b/python/docs/source/user_guide/sql/index.rst @@ -16,14 +16,14 @@ under the License. -================= -DataFrame and SQL -================= +======================== +Spark SQL and DataFrames +======================== .. toctree:: :maxdepth: 2 - dataframe_creation + creating_dataframes arrow_pandas python_udtf type_conversions From 3feff95acf8898b50f783beed741da9b0556d850 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 30 Nov 2023 12:03:16 +0800 Subject: [PATCH 10/11] [SPARK-45861][PYTHON][DOCS] Add user guide for dataframe creation --- .../user_guide/sql/creating_dataframes.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/docs/source/user_guide/sql/creating_dataframes.rst b/python/docs/source/user_guide/sql/creating_dataframes.rst index 021b58c325a7d..507f632479421 100644 --- a/python/docs/source/user_guide/sql/creating_dataframes.rst +++ b/python/docs/source/user_guide/sql/creating_dataframes.rst @@ -20,16 +20,17 @@ Creating DataFrames =================== -PySpark allows you to create :class:`DataFrame`s in several ways. Let's explore these methods with simple examples. - .. currentmodule:: pyspark.sql +PySpark allows you to create :class:`DataFrame`\s in several ways. Let's explore these methods with simple examples. + Creating a :class:`DataFrame` from Lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python - :emphasize-lines: 4 + :emphasize-lines: 5 + >>> # when the schema is not provided, the resulting DataFrame has _1 and _2 as the schema >>> df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) >>> df.show() +-----+---+ @@ -43,7 +44,7 @@ Creating a :class:`DataFrame` from Lists Creating a :class:`DataFrame` with a Specified Schema ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Define a schema and use it to create a DataFrame. A schema describes the column names and types. +Define a schema and use it to create a :class:`DataFrame`. A schema describes the column names and types. .. code-block:: python @@ -51,7 +52,7 @@ Define a schema and use it to create a DataFrame. A schema describes the column >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True) - ... ]) + ... ]) >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) >>> df.show() +-----+---+ @@ -61,7 +62,7 @@ Define a schema and use it to create a DataFrame. A schema describes the column | Bob| 5| +-----+---+ -Use DDL-formatted string schema to create a DataFrame. +Use DDL-formatted string schema to create a :class:`DataFrame`. .. code-block:: python @@ -91,10 +92,10 @@ Dictionaries with keys as column names can also be used. +---+-----+ -Creating a :class:`DataFrame` from :class:`Row`s -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Creating a :class:`DataFrame` from :class:`Row`\s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Use the Row type to define rows of a DataFrame. +Use the :class:`Row` type to define rows of a :class:`DataFrame`. .. 
code-block:: python From 2f0ae13a9afada93b5598439d3e7ae4f22113c89 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 14 Dec 2023 11:43:02 +0800 Subject: [PATCH 11/11] Apply suggestions from code review Co-authored-by: Nicholas Chammas --- python/docs/source/user_guide/sql/creating_dataframes.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/docs/source/user_guide/sql/creating_dataframes.rst b/python/docs/source/user_guide/sql/creating_dataframes.rst index 507f632479421..e81f27f2b4d46 100644 --- a/python/docs/source/user_guide/sql/creating_dataframes.rst +++ b/python/docs/source/user_guide/sql/creating_dataframes.rst @@ -182,7 +182,7 @@ Example with **PARQUET** .. code-block:: python - >>> # Write a Parquet file to the temporary directory, and read it back + >>> # Write a Parquet file to a temporary directory and read it back. >>> import tempfile >>> with tempfile.TemporaryDirectory() as d: ... # Overwrite the path with a new Parquet file @@ -204,14 +204,14 @@ Example with **ORC** .. code-block:: python - >>> # Write a Orc file to the temporary directory, and read it back + >>> # Write an ORC file to a temporary directory and read it back. >>> import tempfile >>> with tempfile.TemporaryDirectory() as d: ... # Overwrite the path with a new Orc file ... spark.createDataFrame( ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] ... ).write.mode("overwrite").format("orc").save(d) - ... # Read the Orc file as a DataFrame + ... # Read the ORC file as a DataFrame ... df = spark.read.format("orc").load(d) ... df.show() +----+-------+
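
Taken together, the in-memory creation paths documented in the patches above all go through :meth:`SparkSession.createDataFrame`. The following is a hedged, self-contained sketch rather than part of any patch hunk: it assumes only a local PySpark and pandas installation, and the application name and sample rows are illustrative.

.. code-block:: python

    # Hedged sketch (not part of the patch series): runs the createDataFrame
    # paths shown above as one standalone script instead of shell doctests.
    import pandas as pd

    from pyspark.sql import Row, SparkSession
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    spark = SparkSession.builder.appName("creating-dataframes-sketch").getOrCreate()

    # Explicit StructType schema, mirroring the "Specified Schema" example.
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
    ])
    spark.createDataFrame([("Alice", 1), ("Bob", 5)], schema).show()

    # The equivalent DDL-formatted string schema.
    spark.createDataFrame([("Alice", 1), ("Bob", 5)], schema="name string, age int").show()

    # Schema inference from dictionaries, Rows, and a pandas DataFrame.
    spark.createDataFrame([{"name": "Alice", "age": 1}]).show()
    spark.createDataFrame([Row(name="Alice", age=1), Row(name="Bob", age=5)]).show()
    spark.createDataFrame(pd.DataFrame({"name": ["Alice", "Bob"], "age": [1, 5]})).show()

    spark.stop()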
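
The read-side examples rely either on sample files shipped in the Spark source tree (``python/test_support/sql/...``) or on a doctest temporary directory. A hedged variant that assumes nothing beyond write access to a temporary directory first writes each format and then reads it back, following the same pattern as the Parquet and ORC doctests above.

.. code-block:: python

    # Hedged sketch (not part of the patch series): round-trips a small
    # DataFrame through JSON, CSV, Parquet and ORC under a temporary directory,
    # so it does not depend on the python/test_support/sql sample files.
    import os
    import tempfile

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("read-format-sketch").getOrCreate()
    people = spark.createDataFrame(
        [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}]
    )

    with tempfile.TemporaryDirectory() as d:
        for fmt in ["json", "csv", "parquet", "orc"]:
            path = os.path.join(d, fmt)
            writer = people.write.mode("overwrite").format(fmt)
            reader = spark.read.format(fmt)
            if fmt == "csv":
                # CSV keeps no schema, so carry column names via a header row.
                writer = writer.option("header", "true")
                reader = reader.option("header", "true")
            writer.save(path)
            reader.load(path).show()

    spark.stop()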