From 3cf243f70f83265feee740f9739eb9c04eae76f6 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 18 Apr 2026 14:33:51 -0400 Subject: [PATCH] Enrich module docstrings and add doctest examples Expands the module docstrings for `functions.py`, `dataframe.py`, `expr.py`, and `context.py` so each module opens with a concept summary, cross-references to related APIs, and a small executable example. Adds doctest examples to the high-traffic `DataFrame` methods that previously lacked them: `select`, `aggregate`, `sort`, `limit`, `join`, and `union`. Optional parameters are demonstrated with keyword syntax, and examples reuse the same input data across variants so the effect of each option is easy to see. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/datafusion/context.py | 27 ++++++- python/datafusion/dataframe.py | 135 ++++++++++++++++++++++++++++++--- python/datafusion/expr.py | 28 ++++++- python/datafusion/functions.py | 22 +++++- 4 files changed, 197 insertions(+), 15 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index c3f94cc16..dd6790402 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -15,7 +15,32 @@ # specific language governing permissions and limitations # under the License. -"""Session Context and it's associated configuration.""" +""":py:class:`SessionContext` — entry point for running DataFusion queries. + +A :py:class:`SessionContext` holds registered tables, catalogs, and +configuration for the current session. It is the first object most programs +create: from it you register data, run SQL strings +(:py:meth:`SessionContext.sql`), read files +(:py:meth:`SessionContext.read_csv`, +:py:meth:`SessionContext.read_parquet`, ...), and construct +:py:class:`~datafusion.dataframe.DataFrame` objects in memory +(:py:meth:`SessionContext.from_pydict`, +:py:meth:`SessionContext.from_arrow`). + +Session behavior (memory limits, batch size, configured optimizer passes, +...) is controlled by :py:class:`SessionConfig` and +:py:class:`RuntimeEnvBuilder`; SQL dialect limits are controlled by +:py:class:`SQLOptions`. + +Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> ctx.sql("SELECT 1 AS n").to_pydict() + {'n': [1]} + +See :ref:`user_guide_concepts` in the online documentation for the broader +execution model. +""" from __future__ import annotations diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c00c85fdb..53d5fea56 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -14,9 +14,32 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""":py:class:`DataFrame` is one of the core concepts in DataFusion. - -See :ref:`user_guide_concepts` in the online documentation for more information. +""":py:class:`DataFrame` — lazy, chainable query representation. + +A :py:class:`DataFrame` is a logical plan over one or more data sources. +Methods that reshape the plan (:py:meth:`DataFrame.select`, +:py:meth:`DataFrame.filter`, :py:meth:`DataFrame.aggregate`, +:py:meth:`DataFrame.sort`, :py:meth:`DataFrame.join`, +:py:meth:`DataFrame.limit`, the set-operation methods, ...) return a new +:py:class:`DataFrame` and do no work until a terminal method such as +:py:meth:`DataFrame.collect`, :py:meth:`DataFrame.to_pydict`, +:py:meth:`DataFrame.show`, or one of the ``write_*`` methods is called. + +DataFrames are produced from a +:py:class:`~datafusion.context.SessionContext`, typically via +:py:meth:`~datafusion.context.SessionContext.sql`, +:py:meth:`~datafusion.context.SessionContext.read_csv`, +:py:meth:`~datafusion.context.SessionContext.read_parquet`, or +:py:meth:`~datafusion.context.SessionContext.from_pydict`. + +Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]}) + >>> df.filter(col("a") > lit(1)).select("b").to_pydict() + {'b': [20, 30]} + +See :ref:`user_guide_concepts` in the online documentation for a high-level +overview of the execution model. """ from __future__ import annotations @@ -503,21 +526,29 @@ def select_exprs(self, *args: str) -> DataFrame: def select(self, *exprs: Expr | str) -> DataFrame: """Project arbitrary expressions into a new :py:class:`DataFrame`. + String arguments are treated as column names; :py:class:`~datafusion.expr.Expr` + arguments can reshape, rename, or compute new columns. + Args: exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select. Returns: DataFrame after projection. It has one column for each expression. - Example usage: + Examples: + Select columns by name: - The following example will return 3 columns from the original dataframe. - The first two columns will be the original column ``a`` and ``b`` since the - string "a" is assumed to refer to column selection. Also a duplicate of - column ``a`` will be returned with the column name ``alternate_a``:: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]}) + >>> df.select("a").to_pydict() + {'a': [1, 2, 3]} - df = df.select("a", col("b"), col("a").alias("alternate_a")) + Mix column names, expressions, and aliases. The string ``"a"`` selects + column ``a`` directly; ``col("a").alias("alternate_a")`` returns a + duplicate under a new name: + >>> df.select("a", col("b"), col("a").alias("alternate_a")).to_pydict() + {'a': [1, 2, 3], 'b': [10, 20, 30], 'alternate_a': [1, 2, 3]} """ exprs_internal = expr_list_to_raw_expr_list(exprs) return DataFrame(self.df.select(*exprs_internal)) @@ -766,6 +797,24 @@ def aggregate( Returns: DataFrame after aggregation. + + Examples: + Aggregate without grouping — an empty ``group_by`` produces a + single row: + + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict( + ... {"team": ["x", "x", "y"], "score": [1, 2, 3]} + ... ) + >>> df.aggregate([], [F.sum(col("score")).alias("total")]).to_pydict() + {'total': [6]} + + Group by a column and produce one row per group: + + >>> df.aggregate( + ... [col("team")], [F.sum(col("score")).alias("total")] + ... ).sort("team").to_pydict() + {'team': ['x', 'y'], 'total': [3, 3]} """ group_by_list = ( list(group_by) @@ -786,13 +835,27 @@ def sort(self, *exprs: SortKey) -> DataFrame: """Sort the DataFrame by the specified sorting expressions or column names. Note that any expression can be turned into a sort expression by - calling its ``sort`` method. + calling its ``sort`` method. For ascending-only sorts, the shorter + :py:meth:`sort_by` is usually more convenient. Args: exprs: Sort expressions or column names, applied in order. Returns: DataFrame after sorting. + + Examples: + Sort ascending by a column name: + + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [3, 1, 2], "b": [10, 20, 30]}) + >>> df.sort("a").to_pydict() + {'a': [1, 2, 3], 'b': [20, 30, 10]} + + Sort descending using :py:meth:`Expr.sort`: + + >>> df.sort(col("a").sort(ascending=False)).to_pydict() + {'a': [3, 2, 1], 'b': [10, 30, 20]} """ exprs_raw = sort_list_to_raw_sort_list(exprs) return DataFrame(self.df.sort(*exprs_raw)) @@ -812,12 +875,28 @@ def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame: def limit(self, count: int, offset: int = 0) -> DataFrame: """Return a new :py:class:`DataFrame` with a limited number of rows. + Results are returned in unspecified order unless the DataFrame is + explicitly sorted first via :py:meth:`sort` or :py:meth:`sort_by`. + Args: count: Number of rows to limit the DataFrame to. offset: Number of rows to skip. Returns: DataFrame after limiting. + + Examples: + Take the first two rows: + + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3, 4]}).sort("a") + >>> df.limit(2).to_pydict() + {'a': [1, 2]} + + Skip the first row then take two (paging): + + >>> df.limit(2, offset=1).to_pydict() + {'a': [2, 3]} """ return DataFrame(self.df.limit(count, offset)) @@ -972,6 +1051,28 @@ def join( Returns: DataFrame after join. + + Examples: + Inner-join two DataFrames on a shared column: + + >>> ctx = dfn.SessionContext() + >>> left = ctx.from_pydict({"id": [1, 2, 3], "val": [10, 20, 30]}) + >>> right = ctx.from_pydict({"id": [2, 3, 4], "label": ["b", "c", "d"]}) + >>> left.join(right, on="id").sort("id").to_pydict() + {'id': [2, 3], 'val': [20, 30], 'label': ['b', 'c']} + + Left join to keep all rows from the left side: + + >>> left.join(right, on="id", how="left").sort("id").to_pydict() + {'id': [1, 2, 3], 'val': [10, 20, 30], 'label': [None, 'b', 'c']} + + Use ``left_on`` / ``right_on`` when the key columns differ in name: + + >>> right2 = ctx.from_pydict({"rid": [2, 3], "label": ["b", "c"]}) + >>> left.join( + ... right2, left_on="id", right_on="rid" + ... ).sort("id").to_pydict() + {'id': [2, 3], 'val': [20, 30], 'rid': [2, 3], 'label': ['b', 'c']} """ if join_keys is not None: warnings.warn( @@ -1165,6 +1266,20 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: Returns: DataFrame after union. + + Examples: + Stack rows from both DataFrames, preserving duplicates: + + >>> ctx = dfn.SessionContext() + >>> df1 = ctx.from_pydict({"a": [1, 2]}) + >>> df2 = ctx.from_pydict({"a": [2, 3]}) + >>> df1.union(df2).sort("a").to_pydict() + {'a': [1, 2, 2, 3]} + + Deduplicate the combined result with ``distinct=True``: + + >>> df1.union(df2, distinct=True).sort("a").to_pydict() + {'a': [1, 2, 3]} """ return DataFrame(self.df.union(other.df, distinct)) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 32004656f..1ff6976f7 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -15,9 +15,31 @@ # specific language governing permissions and limitations # under the License. -"""This module supports expressions, one of the core concepts in DataFusion. - -See :ref:`Expressions` in the online documentation for more details. +""":py:class:`Expr` — the logical expression type used to build DataFusion queries. + +An :py:class:`Expr` represents a computation over columns or literals: a +column reference (``col("a")``), a literal (``lit(5)``), an operator +combination (``col("a") + lit(1)``), or the output of a function from +:py:mod:`datafusion.functions`. Expressions are passed to +:py:class:`~datafusion.dataframe.DataFrame` methods such as +:py:meth:`~datafusion.dataframe.DataFrame.select`, +:py:meth:`~datafusion.dataframe.DataFrame.filter`, +:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and +:py:meth:`~datafusion.dataframe.DataFrame.sort`. + +Convenience constructors are re-exported at the package level: +:py:func:`datafusion.col` / :py:func:`datafusion.column` for column references +and :py:func:`datafusion.lit` / :py:func:`datafusion.literal` for scalar +literals. + +Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> df.select((col("a") * lit(10)).alias("ten_a")).to_pydict() + {'ten_a': [10, 20, 30]} + +See :ref:`expressions` in the online documentation for details on available +operators and helpers. """ # ruff: noqa: PLC0415 diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 841cd9c0b..280a6d3ac 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -14,7 +14,27 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""User functions for operating on :py:class:`~datafusion.expr.Expr`.""" +"""Scalar, aggregate, and window functions for :py:class:`~datafusion.expr.Expr`. + +Each function returns an :py:class:`~datafusion.expr.Expr` that can be combined +with other expressions and passed to +:py:class:`~datafusion.dataframe.DataFrame` methods such as +:py:meth:`~datafusion.dataframe.DataFrame.select`, +:py:meth:`~datafusion.dataframe.DataFrame.filter`, +:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and +:py:meth:`~datafusion.dataframe.DataFrame.window`. The module is conventionally +imported as ``F`` so calls read like ``F.sum(col("price"))``. + +Examples: + >>> from datafusion import functions as F + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3, 4]}) + >>> df.aggregate([], [F.sum(col("a")).alias("total")]).to_pydict() + {'total': [10]} + +See :ref:`aggregation` and :ref:`window_functions` in the online documentation +for categorized catalogs of aggregate and window functions. +""" from __future__ import annotations