4 changes: 4 additions & 0 deletions .github/workflows/build.yml
@@ -48,6 +48,10 @@ jobs:
          uv run --no-project ruff check --output-format=github python/
          uv run --no-project ruff format --check python/

      - name: Run codespell
        run: |
          uv run --no-project codespell --toml pyproject.toml

  generate-license:
    runs-on: ubuntu-latest
    steps:
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -45,5 +45,13 @@ repos:
        types: [file, rust]
        language: system

  - repo: https://github.com/codespell-project/codespell
    rev: v2.4.1
    hooks:
      - id: codespell
        args: [ --toml, "pyproject.toml"]
        additional_dependencies:
          - tomli

default_language_version:
  python: python3
2 changes: 1 addition & 1 deletion README.md
@@ -233,7 +233,7 @@ and for `uv run` commands the additional parameter `--no-project`
git clone git@github.com:apache/datafusion-python.git
# cd to the repo root
cd datafusion-python/
# create the virtual enviornment
# create the virtual environment
uv sync --dev --no-install-package datafusion
# activate the environment
source .venv/bin/activate
2 changes: 1 addition & 1 deletion docs/source/contributor-guide/ffi.rst
@@ -195,7 +195,7 @@ optimization levels. If you wish to go down this route, there are two approaches we
have identified you can use.

#. Re-export all of ``datafusion-python`` yourself with your extensions built in.
#. Carefully synchonize your software releases with the ``datafusion-python`` CI build
#. Carefully synchronize your software releases with the ``datafusion-python`` CI build
system so that your libraries use the exact same compiler, features, and
optimization level.

2 changes: 1 addition & 1 deletion docs/source/contributor-guide/introduction.rst
@@ -43,7 +43,7 @@ Bootstrap:

# fetch this repo
git clone git@github.com:apache/datafusion-python.git
# create the virtual enviornment
# create the virtual environment
uv sync --dev --no-install-package datafusion
# activate the environment
source .venv/bin/activate
2 changes: 1 addition & 1 deletion docs/source/user-guide/common-operations/expressions.rst
@@ -64,7 +64,7 @@ Arrays
------

For columns that contain arrays of values, you can access individual elements of the array by index
using bracket indexing. This is similar to callling the function
using bracket indexing. This is similar to calling the function
:py:func:`datafusion.functions.array_element`, except that array indexing using brackets is 0 based,
similar to Python arrays and ``array_element`` is 1 based indexing to be compatible with other SQL
approaches.
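For reference, a minimal sketch of the 0-based bracket indexing this hunk documents (the data is illustrative)::

    from datafusion import SessionContext, col, lit
    from datafusion import functions as f

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [[10, 20, 30]]})

    # bracket indexing is 0-based: col("a")[0] yields 10
    df.select(col("a")[0]).show()
    # array_element is 1-based: index 1 also yields the first element
    df.select(f.array_element(col("a"), lit(1))).show()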
8 changes: 4 additions & 4 deletions docs/source/user-guide/common-operations/windows.rst
@@ -24,7 +24,7 @@ In this section you will learn about window functions. A window function utilizes
multiple rows to produce a result for each individual row, unlike an aggregate function that
provides a single value for multiple rows.

The window functions are availble in the :py:mod:`~datafusion.functions` module.
The window functions are available in the :py:mod:`~datafusion.functions` module.

We'll use the pokemon dataset (from Ritchie Vink) in the following examples.

@@ -99,8 +99,8 @@ If you do not specify a Window Frame, the frame will be set depending on the following
criteria.

* If an ``order_by`` clause is set, the default window frame is defined as the rows between
unbounded preceeding and the current row.
* If an ``order_by`` is not set, the default frame is defined as the rows betwene unbounded
unbounded preceding and the current row.
* If an ``order_by`` is not set, the default frame is defined as the rows between unbounded preceding
and unbounded following (the entire partition).

Window Frames are defined by three parameters: unit type, starting bound, and ending bound.
@@ -116,7 +116,7 @@ The unit types available are:
``order_by`` clause.

In this example we perform a "rolling average" of the speed of the current Pokemon and the
two preceeding rows.
two preceding rows.

.. ipython:: python

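For reference, a sketch of the corrected default-frame rules (assumes ``Window`` is importable from ``datafusion.expr`` and a ``df`` with a ``speed`` column; both are illustrative)::

    from datafusion import col
    from datafusion import functions as f
    from datafusion.expr import Window

    # order_by set: frame defaults to unbounded preceding .. current row
    running_avg = f.avg(col("speed")).over(Window(order_by=[col("speed")]))
    # no order_by: frame defaults to the entire partition
    overall_avg = f.avg(col("speed")).over(Window())
    df.select(running_avg.alias("running"), overall_avg.alias("overall"))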
4 changes: 2 additions & 2 deletions docs/source/user-guide/data-sources.rst
@@ -25,7 +25,7 @@ DataFusion provides a wide variety of ways to get data into a DataFrame to perfo
Local file
----------

DataFusion has the abilty to read from a variety of popular file formats, such as :ref:`Parquet <io_parquet>`,
DataFusion has the ability to read from a variety of popular file formats, such as :ref:`Parquet <io_parquet>`,
:ref:`CSV <io_csv>`, :ref:`JSON <io_json>`, and :ref:`AVRO <io_avro>`.

.. ipython:: python
@@ -120,7 +120,7 @@ DataFusion can import DataFrames directly from other libraries, such as
`Polars <https://pola.rs/>`_ and `Pandas <https://pandas.pydata.org/>`_.
Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule
interface can be imported to DataFusion using the
:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older verions of Polars may
:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may
not support the arrow interface. In those cases, you can still import via the
:py:func:`~datafusion.context.SessionContext.from_polars` function.

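For reference, a sketch of both import paths described in this hunk (assumes a Polars build that exposes the Arrow PyCapsule interface)::

    import polars as pl
    from datafusion import SessionContext

    ctx = SessionContext()
    pl_df = pl.DataFrame({"a": [1, 2, 3]})

    # any library exposing the Arrow PyCapsule interface imports directly
    df = ctx.from_arrow(pl_df)
    # fallback for older Polars releases without the Arrow interface
    df = ctx.from_polars(pl_df)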
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -129,6 +129,18 @@ max-doc-length = 88
"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"]
"docs/*" = ["D"]

[tool.codespell]
skip = [
"./target",
"uv.lock",
"./python/tests/test_functions.py"
]
count = true
ignore-words-list = [
"ans",
"IST"
]

[dependency-groups]
dev = [
"maturin>=1.8.1",
@@ -139,6 +151,7 @@ dev = [
"ruff>=0.9.1",
"toml>=0.10.2",
"pygithub==2.5.0",
"codespell==2.4.1",
]
docs = [
"sphinx>=7.1.2",
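The ``[tool.codespell]`` table above drives both the CI step and the pre-commit hook. A one-off check can also be run from Python (a sketch; assumes ``codespell`` is installed in the active environment)::

    import subprocess

    # same invocation as the CI step; settings are read from [tool.codespell]
    result = subprocess.run(["codespell", "--toml", "pyproject.toml"])
    # a non-zero return code means misspellings were found
    print(result.returncode)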
4 changes: 2 additions & 2 deletions python/datafusion/dataframe.py
@@ -588,7 +588,7 @@ def tail(self, n: int = 5) -> DataFrame:
def collect(self) -> list[pa.RecordBatch]:
"""Execute this :py:class:`DataFrame` and collect results into memory.

Prior to calling ``collect``, modifying a DataFrme simply updates a plan
Prior to calling ``collect``, modifying a DataFrame simply updates a plan
(no actual computation is performed). Calling ``collect`` triggers the
computation.

@@ -767,7 +767,7 @@ def explain(self, verbose: bool = False, analyze: bool = False) -> None:

Args:
verbose: If ``True``, more details will be included.
analyze: If ``Tru`e``, the plan will run and metrics reported.
analyze: If ``True``, the plan will run and metrics reported.
"""
self.df.explain(verbose, analyze)

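For reference, a sketch of the lazy evaluation the corrected docstrings describe (the data is illustrative)::

    from datafusion import SessionContext, col, lit

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3]})
    df = df.filter(col("a") > lit(1))         # only updates the logical plan
    df.explain(verbose=False, analyze=False)  # print the plan without collecting
    batches = df.collect()                    # triggers the actual computation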
12 changes: 6 additions & 6 deletions python/datafusion/functions.py
@@ -1673,7 +1673,7 @@ def approx_percentile_cont(
between two of the values.

This function uses the [t-digest](https://arxiv.org/abs/1902.04023) algorithm to
compute the percentil. You can limit the number of bins used in this algorithm by
compute the percentile. You can limit the number of bins used in this algorithm by
setting the ``num_centroids`` parameter.

If using the builder functions described in ref:`_aggregation` this function ignores
@@ -2415,7 +2415,7 @@ def lead(
Lead operation will return the argument that is in the next shift_offset-th row in
the partition. For example ``lead(col("b"), shift_offset=3, default_value=5)`` will
return the 3rd following value in column ``b``. At the end of the partition, where
no futher values can be returned it will return the default value of 5.
no further values can be returned it will return the default value of 5.

Here is an example of both the ``lead`` and :py:func:`datafusion.functions.lag`
functions on a simple DataFrame::
@@ -2467,7 +2467,7 @@ def lag(

Lag operation will return the argument that is in the previous shift_offset-th row
in the partition. For example ``lag(col("b"), shift_offset=3, default_value=5)``
will return the 3rd previous value in column ``b``. At the beginnig of the
will return the 3rd previous value in column ``b``. At the beginning of the
partition, where no values can be returned it will return the default value of 5.

Here is an example of both the ``lag`` and :py:func:`datafusion.functions.lead`
@@ -2548,7 +2548,7 @@ def rank(

Returns the rank based upon the window order. Consecutive equal values will receive
the same rank, but the next different value will not be consecutive but rather the
number of rows that preceed it plus one. This is similar to Olympic medals. If two
number of rows that precede it plus one. This is similar to Olympic medals. If two
people tie for gold, the next place is bronze. There would be no silver medal. Here
is an example of a dataframe with a window ordered by descending ``points`` and the
associated rank.
@@ -2655,7 +2655,7 @@ def cume_dist(
"""Create a cumulative distribution window function.

This window function is similar to :py:func:`rank` except that the returned values
are the ratio of the row number to the total numebr of rows. Here is an example of a
are the ratio of the row number to the total number of rows. Here is an example of a
dataframe with a window ordered by descending ``points`` and the associated
cumulative distribution::

@@ -2732,7 +2732,7 @@ def string_agg(
"""Concatenates the input strings.

This aggregate function will concatenate input strings, ignoring null values, and
seperating them with the specified delimiter. Non-string values will be converted to
separating them with the specified delimiter. Non-string values will be converted to
their string equivalents.

If using the builder functions described in ref:`_aggregation` this function ignores
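For reference, a sketch exercising a few of the functions whose docstrings are corrected above (data, offsets, and aliases are illustrative)::

    from datafusion import SessionContext, col
    from datafusion import functions as f

    ctx = SessionContext()
    df = ctx.from_pydict({"a": ["x", "x", "y"], "b": [1, 2, 3]})

    # lead/lag look ahead/behind by shift_offset, else default_value
    df.select(
        f.lead(col("b"), shift_offset=1, default_value=5).alias("lead_b"),
        f.lag(col("b"), shift_offset=1, default_value=5).alias("lag_b"),
    ).show()

    # string_agg ignores nulls and joins with the given delimiter
    df.aggregate([], [f.string_agg(col("a"), ",")]).show()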
2 changes: 1 addition & 1 deletion python/datafusion/user_defined.py
@@ -528,7 +528,7 @@ def memoize(self) -> None:
"""

def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002
"""Return the range for the window fuction.
"""Return the range for the window function.

If `uses_window_frame` flag is `false`. This method is used to
calculate required range for the window function during
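For reference, a minimal sketch of overriding ``get_range`` in a custom ``WindowEvaluator`` (the import path and frame bounds are assumptions, not taken from this PR)::

    from datafusion.user_defined import WindowEvaluator

    class RunningEvaluator(WindowEvaluator):
        """Evaluate over the partition start through the current row."""

        def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:
            # illustrative frame: rows [0, idx] of the current partition
            return (0, idx + 1)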
8 changes: 4 additions & 4 deletions python/tests/test_dataframe.py
@@ -809,8 +809,8 @@ def test_window_frame_defaults_match_postgres(partitioned_df):

assert df_1.sort(col_a).to_pydict() == expected

# When order is not set, the default frame should be unounded preceeding to
# unbounded following. When order is set, the default frame is unbounded preceeding
# When order is not set, the default frame should be unbounded preceding to
# unbounded following. When order is set, the default frame is unbounded preceding
# to current row.
no_order = f.avg(col_a).over(Window()).alias("over_no_order")
with_order = f.avg(col_a).over(Window(order_by=[col_a])).alias("over_with_order")
@@ -1084,14 +1084,14 @@ def test_html_formatter_repr_rows(df, clean_formatter_state):
html_output = df._repr_html_()

tr_count = count_table_rows(html_output)
# Tabe should have header row (1) + 2 data rows = 3 rows
# Table should have header row (1) + 2 data rows = 3 rows
assert tr_count == 3

configure_formatter(min_rows_display=2, repr_rows=3)
html_output = df._repr_html_()

tr_count = count_table_rows(html_output)
# Tabe should have header row (1) + 3 data rows = 4 rows
# Table should have header row (1) + 3 data rows = 4 rows
assert tr_count == 4


2 changes: 1 addition & 1 deletion src/common/data_type.rs
@@ -576,7 +576,7 @@ impl DataTypeMap {
}

/// Unfortunately PyO3 does not allow for us to expose the DataType as an enum since
/// we cannot directly annotae the Enum instance of dependency code. Therefore, here
/// we cannot directly annotate the Enum instance of dependency code. Therefore, here
/// we provide an enum to mimic it.
#[pyo3(name = "friendly_arrow_type_name")]
pub fn friendly_arrow_type_name(&self) -> PyResult<&str> {
2 changes: 1 addition & 1 deletion src/dataframe.rs
@@ -1073,7 +1073,7 @@ fn record_batch_into_schema(
/// This is a helper function to return the first non-empty record batch from executing a DataFrame.
/// It additionally returns a bool, which indicates if there are more record batches available.
/// We do this so we can determine if we should indicate to the user that the data has been
/// truncated. This collects until we have achived both of these two conditions
/// truncated. This collects until we have achieved both of these two conditions
///
/// - We have collected our minimum number of rows
/// - We have reached our limit, either data size or maximum number of rows