diff --git a/python/docs/source/user_guide/sql/creating_dataframes.rst b/python/docs/source/user_guide/sql/creating_dataframes.rst new file mode 100644 index 0000000000000..e81f27f2b4d46 --- /dev/null +++ b/python/docs/source/user_guide/sql/creating_dataframes.rst @@ -0,0 +1,223 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +=================== +Creating DataFrames +=================== + +.. currentmodule:: pyspark.sql + +PySpark allows you to create :class:`DataFrame`\s in several ways. Let's explore these methods with simple examples. + +Creating a :class:`DataFrame` from Lists +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + :emphasize-lines: 5 + + >>> # when the schema is not provided, the resulting DataFrame has _1 and _2 as the schema + >>> df = spark.createDataFrame([['Alice', 1], ['Bob', 5]]) + >>> df.show() + +-----+---+ + | _1| _2| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ + + +Creating a :class:`DataFrame` with a Specified Schema +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Define a schema and use it to create a :class:`DataFrame`. A schema describes the column names and types. + +.. code-block:: python + + >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType + >>> schema = StructType([ + ... StructField("name", StringType(), True), + ... StructField("age", IntegerType(), True) + ... ]) + >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema) + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ + +Use DDL-formatted string schema to create a :class:`DataFrame`. + +.. code-block:: python + + >>> df = spark.createDataFrame([('Alice', 1), ('Bob', 5)], schema = "name string, age int") + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ + + +Creating a :class:`DataFrame` from Dictionaries +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dictionaries with keys as column names can also be used. + +.. code-block:: python + + >>> df = spark.createDataFrame([{'name': 'Alice', 'age': 1}]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 1|Alice| + +---+-----+ + + +Creating a :class:`DataFrame` from :class:`Row`\s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the :class:`Row` type to define rows of a :class:`DataFrame`. + +.. code-block:: python + + >>> from pyspark.sql import Row + >>> Person = Row('name', 'age') + >>> df = spark.createDataFrame([Person("Alice", 1), Person("Bob", 5)]) + >>> df.show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 1| + | Bob| 5| + +-----+---+ + + +Creating a :class:`DataFrame` from a :class:`pandas.DataFrame` or a :class:`numpy.ndarray` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> import pandas as pd + >>> df = spark.createDataFrame(pd.DataFrame([[1, 2]])) + >>> df.show() + +---+---+ + | 0| 1| + +---+---+ + | 1| 2| + +---+---+ + + +.. code-block:: python + + >>> import numpy as np + >>> import pandas as pd + >>> df = spark.createDataFrame(pd.DataFrame(data=np.array([[1, 2], [3, 4]]), + ... columns=['a', 'b'])) + >>> df.show() + +---+---+ + | a| b| + +---+---+ + | 1| 2| + | 3| 4| + +---+---+ + + +Reading Data from Files +----------------------- + +Example with **JSON** +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> df = spark.read.format("json").load("python/test_support/sql/people.json") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ + + +Example with **CSV** +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> df = spark.read.format("csv").option("header", "true").load( + ... "python/test_support/sql/people.csv") + >>> df.show() + +----+-------+ + | age| name| + +----+-------+ + |NULL|Michael| + | 30| Andy| + | 19| Justin| + +----+-------+ + + +Example with **PARQUET** +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> # Write a Parquet file to a temporary directory and read it back. + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as d: + ... # Overwrite the path with a new Parquet file + ... spark.createDataFrame( + ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] + ... ).write.mode("overwrite").format("parquet").save(d) + ... # Read the Parquet file as a DataFrame + ... df = spark.read.format("parquet").load(d) + ... df.show() + +----+-------+ + | age| name| + +----+-------+ + | 30| Andy| + |NULL|Michael| + +----+-------+ + +Example with **ORC** +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + >>> # Write an ORC file to a temporary directory and read it back. + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as d: + ... # Overwrite the path with a new Orc file + ... spark.createDataFrame( + ... [{"age": None, "name": "Michael"}, {"age": 30, "name": "Andy"}] + ... ).write.mode("overwrite").format("orc").save(d) + ... # Read the ORC file as a DataFrame + ... df = spark.read.format("orc").load(d) + ... df.show() + +----+-------+ + | age| name| + +----+-------+ + | 30| Andy| + |NULL|Michael| + +----+-------+ + diff --git a/python/docs/source/user_guide/sql/index.rst b/python/docs/source/user_guide/sql/index.rst index 118cf139d9b38..1d3c9057f99e5 100644 --- a/python/docs/source/user_guide/sql/index.rst +++ b/python/docs/source/user_guide/sql/index.rst @@ -16,13 +16,14 @@ under the License. -========= -Spark SQL -========= +======================== +Spark SQL and DataFrames +======================== .. toctree:: :maxdepth: 2 + creating_dataframes arrow_pandas python_udtf type_conversions