From e6ff1ae3e291fa2610779b4227f4e9d0c080b905 Mon Sep 17 00:00:00 2001 From: kmitchener Date: Mon, 1 Aug 2022 13:07:14 -0400 Subject: [PATCH] moved python docs from DataFusion project --- docs/source/python/api.rst | 30 +++ docs/source/python/api/dataframe.rst | 27 ++ docs/source/python/api/execution_context.rst | 27 ++ docs/source/python/api/expression.rst | 27 ++ docs/source/python/api/functions.rst | 27 ++ .../python/generated/datafusion.DataFrame.rst | 33 +++ .../generated/datafusion.Expression.rst | 28 ++ .../generated/datafusion.SessionContext.rst | 34 +++ .../python/generated/datafusion.functions.rst | 23 ++ docs/source/python/index.rst | 251 ++++++++++++++++++ 10 files changed, 507 insertions(+) create mode 100644 docs/source/python/api.rst create mode 100644 docs/source/python/api/dataframe.rst create mode 100644 docs/source/python/api/execution_context.rst create mode 100644 docs/source/python/api/expression.rst create mode 100644 docs/source/python/api/functions.rst create mode 100644 docs/source/python/generated/datafusion.DataFrame.rst create mode 100644 docs/source/python/generated/datafusion.Expression.rst create mode 100644 docs/source/python/generated/datafusion.SessionContext.rst create mode 100644 docs/source/python/generated/datafusion.functions.rst create mode 100644 docs/source/python/index.rst diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst new file mode 100644 index 000000000..f81753e08 --- /dev/null +++ b/docs/source/python/api.rst @@ -0,0 +1,30 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api: + +************* +API Reference +************* + +.. toctree:: + :maxdepth: 2 + + api/dataframe + api/execution_context + api/expression + api/functions diff --git a/docs/source/python/api/dataframe.rst b/docs/source/python/api/dataframe.rst new file mode 100644 index 000000000..0a3c4c8b1 --- /dev/null +++ b/docs/source/python/api/dataframe.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.dataframe: +.. currentmodule:: datafusion + +DataFrame +========= + +.. autosummary:: + :toctree: ../generated/ + + DataFrame diff --git a/docs/source/python/api/execution_context.rst b/docs/source/python/api/execution_context.rst new file mode 100644 index 000000000..5b7e0f82f --- /dev/null +++ b/docs/source/python/api/execution_context.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. 
See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.execution_context: +.. currentmodule:: datafusion + +SessionContext +================ + +.. autosummary:: + :toctree: ../generated/ + + SessionContext diff --git a/docs/source/python/api/expression.rst b/docs/source/python/api/expression.rst new file mode 100644 index 000000000..45923fb54 --- /dev/null +++ b/docs/source/python/api/expression.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.expression: +.. currentmodule:: datafusion + +Expression +========== + +.. 
autosummary:: + :toctree: ../generated/ + + Expression diff --git a/docs/source/python/api/functions.rst b/docs/source/python/api/functions.rst new file mode 100644 index 000000000..6f10d826e --- /dev/null +++ b/docs/source/python/api/functions.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.functions: +.. currentmodule:: datafusion + +Functions +========= + +.. autosummary:: + :toctree: ../generated/ + + functions diff --git a/docs/source/python/generated/datafusion.DataFrame.rst b/docs/source/python/generated/datafusion.DataFrame.rst new file mode 100644 index 000000000..365f5931d --- /dev/null +++ b/docs/source/python/generated/datafusion.DataFrame.rst @@ -0,0 +1,33 @@ +datafusion.DataFrame +==================== + +.. currentmodule:: datafusion + +.. autoclass:: DataFrame + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. 
autosummary:: + + ~DataFrame.__init__ + ~DataFrame.aggregate + ~DataFrame.collect + ~DataFrame.explain + ~DataFrame.filter + ~DataFrame.join + ~DataFrame.limit + ~DataFrame.schema + ~DataFrame.select + ~DataFrame.select_columns + ~DataFrame.show + ~DataFrame.sort + + + + + + \ No newline at end of file diff --git a/docs/source/python/generated/datafusion.Expression.rst b/docs/source/python/generated/datafusion.Expression.rst new file mode 100644 index 000000000..427fed0d3 --- /dev/null +++ b/docs/source/python/generated/datafusion.Expression.rst @@ -0,0 +1,28 @@ +datafusion.Expression +===================== + +.. currentmodule:: datafusion + +.. autoclass:: Expression + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~Expression.__init__ + ~Expression.alias + ~Expression.cast + ~Expression.column + ~Expression.is_null + ~Expression.literal + ~Expression.sort + + + + + + \ No newline at end of file diff --git a/docs/source/python/generated/datafusion.SessionContext.rst b/docs/source/python/generated/datafusion.SessionContext.rst new file mode 100644 index 000000000..86b942f20 --- /dev/null +++ b/docs/source/python/generated/datafusion.SessionContext.rst @@ -0,0 +1,34 @@ +datafusion.SessionContext +========================= + +.. currentmodule:: datafusion + +.. autoclass:: SessionContext + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. 
autosummary:: + + ~SessionContext.__init__ + ~SessionContext.catalog + ~SessionContext.create_dataframe + ~SessionContext.deregister_table + ~SessionContext.empty_table + ~SessionContext.register_csv + ~SessionContext.register_parquet + ~SessionContext.register_record_batches + ~SessionContext.register_table + ~SessionContext.register_udf + ~SessionContext.sql + ~SessionContext.table + ~SessionContext.tables + + + + + + \ No newline at end of file diff --git a/docs/source/python/generated/datafusion.functions.rst b/docs/source/python/generated/datafusion.functions.rst new file mode 100644 index 000000000..4bac3c33a --- /dev/null +++ b/docs/source/python/generated/datafusion.functions.rst @@ -0,0 +1,23 @@ +datafusion.functions +==================== + +.. automodule:: datafusion.functions + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst new file mode 100644 index 000000000..167e66b9f --- /dev/null +++ b/docs/source/python/index.rst @@ -0,0 +1,251 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +==================== +DataFusion in Python +==================== + +This is a Python library that binds to the `Apache Arrow <https://arrow.apache.org/>`_ in-memory query engine `DataFusion <https://github.com/apache/arrow-datafusion>`_.
+ +Like PySpark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, Parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. + +It also allows you to use UDFs and UDAFs for complex operations. + +The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. + +Its query engine, DataFusion, is written in `Rust <https://www.rust-lang.org/>`_, which makes strong assumptions about thread safety and lack of memory leaks. + +Technically, zero-copy is achieved via the `C data interface <https://arrow.apache.org/docs/format/CDataInterface.html>`_. + +How to use it +============= + +Simple usage: + +.. code-block:: python + + import datafusion + from datafusion import functions as f + from datafusion import col + import pyarrow + + # create a context + ctx = datafusion.SessionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) + + # create a new statement + df = df.select( + col("a") + col("b"), + col("a") - col("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pyarrow.array([5, 7, 9]) + assert result.column(1) == pyarrow.array([-3, -3, -3]) + + +We can also execute a query against data stored in a CSV file: + +.. code-block:: bash + + echo "a,b\n1,4\n2,5\n3,6" > example.csv + + +.. 
code-block:: python + + import datafusion + from datafusion import functions as f + from datafusion import col + import pyarrow + + # create a context + ctx = datafusion.SessionContext() + + # register a CSV + ctx.register_csv('example', 'example.csv') + + # create a new statement + df = ctx.table('example').select( + col("a") + col("b"), + col("a") - col("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pyarrow.array([5, 7, 9]) + assert result.column(1) == pyarrow.array([-3, -3, -3]) + + +And how to execute a query against a CSV using SQL: + + +.. code-block:: python + + import datafusion + from datafusion import functions as f + from datafusion import col + import pyarrow + + # create a context + ctx = datafusion.SessionContext() + + # register a CSV + ctx.register_csv('example', 'example.csv') + + # create a new statement via SQL + df = ctx.sql("SELECT a+b, a-b FROM example") + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pyarrow.array([5, 7, 9]) + assert result.column(1) == pyarrow.array([-3, -3, -3]) + + + +UDFs +---- + +.. code-block:: python + + def is_null(array: pyarrow.Array) -> pyarrow.Array: + return array.is_null() + + udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) + + df = df.select(udf(col("a"))) + + +UDAF +---- + +.. code-block:: python + + import pyarrow + import pyarrow.compute + + + class Accumulator: + """ + Interface of a user-defined accumulation. + """ + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + + df = ... + + udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) + + df = df.aggregate( + [], + [udaf(col("a"))] + ) + + +How to install (from pip) +========================= + +.. code-block:: shell + + pip install datafusion + + +How to develop +============== + +This assumes that you have Rust and Cargo installed. We use the workflow recommended by `pyo3 <https://github.com/PyO3/pyo3>`_ and `maturin <https://github.com/PyO3/maturin>`_. + +Bootstrap: + +.. code-block:: shell + + # fetch this repo + git clone git@github.com:apache/arrow-datafusion.git + + cd arrow-datafusion/python + + # prepare development environment (used to build wheel / install in development) + python3 -m venv venv + # activate the venv + source venv/bin/activate + pip install -r requirements.txt + + +Whenever Rust code changes (your changes or via `git pull`): + +.. code-block:: shell + + # make sure you activate the venv using "source venv/bin/activate" first + maturin develop + python -m pytest + + +How to update dependencies +========================== + +To change test dependencies, change the `requirements.in` and run: + +.. code-block:: shell + + # install pip-tools (this can be done only once), also consider running in venv + pip install pip-tools + + # change requirements.in and then run + pip-compile --generate-hashes + + +To update dependencies, run: + +.. code-block:: shell + + pip-compile --upgrade --generate-hashes + + +More details about pip-tools `here <https://github.com/jazzband/pip-tools>`_ + + +API reference +============= + +.. toctree:: + :maxdepth: 2 + + api