From 95cccca47e12fb2cdb75b05e3ada8259f5eab2d1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:07:09 -0600 Subject: [PATCH 01/20] docs: scaffold Sphinx documentation site --- .gitignore | 11 +++++ docs/Makefile | 31 ++++++++++++++ docs/README.md | 49 ++++++++++++++++++++++ docs/build.sh | 31 ++++++++++++++ docs/make.bat | 35 ++++++++++++++++ docs/requirements.txt | 20 +++++++++ docs/source/_static/.gitkeep | 0 docs/source/_templates/.gitkeep | 0 docs/source/conf.py | 56 ++++++++++++++++++++++++++ docs/source/contributor-guide/index.md | 22 ++++++++++ docs/source/index.md | 29 +++++++++++++ docs/source/user-guide/index.md | 22 ++++++++++ 12 files changed, 306 insertions(+) create mode 100644 .gitignore create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100755 docs/build.sh create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 docs/source/_static/.gitkeep create mode 100644 docs/source/_templates/.gitkeep create mode 100644 docs/source/conf.py create mode 100644 docs/source/contributor-guide/index.md create mode 100644 docs/source/index.md create mode 100644 docs/source/user-guide/index.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..009119e --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +target/ +*.class +.idea/ +.vscode/ +*.iml +.DS_Store +tpch-data/ +.claude +docs/superpowers +docs/build/ +docs/venv/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..714088d --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Minimal makefile for Sphinx documentation + +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..83c5f37 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,49 @@ + + +# Apache DataFusion Java Documentation + +This directory contains the Sphinx source for the Apache DataFusion Java +documentation site. + +## Build + +Building the docs requires Python 3.9 or newer. A virtual environment under +`docs/venv/` is the recommended workflow. + +```sh +cd docs +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +./build.sh +``` + +The generated site is written to `docs/build/html/`. Open +`docs/build/html/index.html` in a browser to preview. + +Subsequent builds need only: + +```sh +cd docs +source venv/bin/activate +./build.sh +``` + +`./build.sh` runs `sphinx-build` with `-W` so warnings fail the build. diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 0000000..c487a09 --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +cd "$(dirname "$0")" + +rm -rf build + +if [ -d venv ]; then + # shellcheck disable=SC1091 + source venv/bin/activate +fi + +sphinx-build -b html -W --keep-going source build/html diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..dc1312a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..6ac8d0e --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sphinx>=7.0,<8.0 +myst-parser>=2.0,<4.0 +pydata-sphinx-theme>=0.16.1,<0.17.0 diff --git a/docs/source/_static/.gitkeep b/docs/source/_static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/_templates/.gitkeep b/docs/source/_templates/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..cf86b40 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+"""Sphinx configuration for the Apache DataFusion Java documentation."""
+
+project = "Apache DataFusion Java"
+copyright = "2026, Apache Software Foundation"
+author = "Apache Software Foundation"
+
+extensions = [
+    "sphinx.ext.mathjax",
+    "sphinx.ext.napoleon",
+    "myst_parser",
+]
+
+source_suffix = {
+    ".md": "markdown",
+}
+
+templates_path = ["_templates"]
+exclude_patterns = []
+
+html_theme = "pydata_sphinx_theme"
+html_theme_options = {
+    "use_edit_page_button": False,
+    "show_toc_level": 2,
+}
+
+html_context = {
+    "github_user": "apache",
+    "github_repo": "datafusion-java",
+    "github_version": "main",
+    "doc_path": "docs/source",
+}
+
+html_static_path = ["_static"]
+
+# Auto-generate anchor links for headings h1, h2, h3.
+myst_heading_anchors = 3
+
+# Enable nice rendering of GitHub-style task lists.
+myst_enable_extensions = ["tasklist"]
diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
new file mode 100644
index 0000000..bac4c60
--- /dev/null
+++ b/docs/source/contributor-guide/index.md
@@ -0,0 +1,22 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Contributor Guide
+
+This guide is under construction.
diff --git a/docs/source/index.md b/docs/source/index.md
new file mode 100644
index 0000000..afbc75a
--- /dev/null
+++ b/docs/source/index.md
@@ -0,0 +1,29 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Java
+
+```{toctree}
+:maxdepth: 1
+:caption: Documentation
+:hidden:
+
+User Guide <user-guide/index>
+Contributor Guide <contributor-guide/index>
+```
diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md
new file mode 100644
index 0000000..62fd3fd
--- /dev/null
+++ b/docs/source/user-guide/index.md
@@ -0,0 +1,22 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# User Guide
+
+This guide is under construction.
From 0dcab135e5ea600d7af4d319096bcb2851ecfbd7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:10:54 -0600 Subject: [PATCH 02/20] docs: add ASF license header to make.bat --- docs/make.bat | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/make.bat b/docs/make.bat index dc1312a..08805d0 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,5 +1,22 @@ @ECHO OFF +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + pushd %~dp0 REM Command file for Sphinx documentation From 19d166648de34b71bfb226e1e391fb7c7d327421 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:11:55 -0600 Subject: [PATCH 03/20] docs: write landing page with toctree --- docs/source/index.md | 72 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/source/index.md b/docs/source/index.md index afbc75a..675021e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -19,11 +19,77 @@ under the License. # Apache DataFusion Java +Java bindings for [Apache DataFusion]. Queries run in native Rust and results +return to the JVM as [Apache Arrow] batches via the Arrow C Data Interface. 
+ +[Apache DataFusion]: https://datafusion.apache.org/ +[Apache Arrow]: https://arrow.apache.org/ + +> Early development: no releases yet, API will change. Bug reports and +> contributions welcome. + +## Quickstart + +```java +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.datafusion.DataFrame; +import org.apache.datafusion.SessionContext; + +try (var allocator = new RootAllocator(); + var ctx = new SessionContext()) { + + ctx.registerParquet("orders", "/path/to/orders.parquet"); + + try (DataFrame df = ctx.sql( + "SELECT o_orderpriority, COUNT(*) AS n " + + "FROM orders GROUP BY o_orderpriority"); + ArrowReader reader = df.collect(allocator)) { + while (reader.loadNextBatch()) { + var batch = reader.getVectorSchemaRoot(); + // ... + } + } +} +``` + +See the [User Guide](user-guide/index.md) for installation, the DataFrame and +SQL APIs, and Parquet ingestion. See the [Contributor Guide](contributor-guide/index.md) +for build, test, and release workflows. 
+
+```{toctree}
+:maxdepth: 1
+:caption: Links
+:hidden:
+
+GitHub Repository <https://github.com/apache/datafusion-java>
+Issue Tracker <https://github.com/apache/datafusion-java/issues>
+Apache DataFusion <https://datafusion.apache.org/>
+Code of Conduct <https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md>
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: User Guide
+:hidden:
+
+user-guide/index
+user-guide/installation
+user-guide/quickstart
+user-guide/sessioncontext
+user-guide/dataframe
+user-guide/parquet
+user-guide/project-status
+```
+
 ```{toctree}
 :maxdepth: 1
-:caption: Documentation
+:caption: Contributor Guide
 :hidden:
 
-User Guide <user-guide/index>
-Contributor Guide <contributor-guide/index>
+contributor-guide/index
+contributor-guide/development
+contributor-guide/code-style
+contributor-guide/releasing
+contributor-guide/updating-datafusion-version
 ```

From da500e7801fcab12b369d4ae01a2fd844e6803ba Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 13 May 2026 07:13:53 -0600
Subject: [PATCH 04/20] docs: add user guide installation page

---
 docs/source/user-guide/installation.md | 52 ++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 docs/source/user-guide/installation.md

diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md
new file mode 100644
index 0000000..3141355
--- /dev/null
+++ b/docs/source/user-guide/installation.md
@@ -0,0 +1,52 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Installation
+
+Apache DataFusion Java has not yet published a release. Until the first
+release, the only way to use the library is to build from source.
+
+## Requirements
+
+- **JDK 17 or newer.** Set `JAVA_HOME` to point at it.
+- **Rust toolchain (stable).** Install via [rustup].
+
+[rustup]: https://rustup.rs/
+
+## Build from source
+
+```sh
+git clone https://github.com/apache/datafusion-java.git
+cd datafusion-java
+make test
+```
+
+`make test` compiles the native Rust crate, then runs the JUnit tests
+against it. The native library must be built before the JVM tests can
+run.
+
+The first build in a fresh checkout reaches out to
+`raw.githubusercontent.com` to fetch the DataFusion `.proto` files used to
+generate the `datafusion-proto` Java classes.
Subsequent builds are +offline; the `download-maven-plugin` cache under +`~/.m2/repository/.cache/` satisfies them. + +For development workflow details — running individual tests, the TPC-H +integration test data, code style, and how to update the underlying +DataFusion version — see the [Contributor Guide](../contributor-guide/development.md). From dbc55e35bf1cf05685efc3113eba60109f399254 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:15:11 -0600 Subject: [PATCH 05/20] docs: write user guide landing page --- docs/source/user-guide/index.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md index 62fd3fd..d5cbeb9 100644 --- a/docs/source/user-guide/index.md +++ b/docs/source/user-guide/index.md @@ -19,4 +19,23 @@ under the License. # User Guide -This guide is under construction. +Apache DataFusion Java is a thin Java binding over the +[Apache DataFusion](https://datafusion.apache.org/) query engine. SQL and +DataFrame queries execute in native Rust; results return to the JVM as +[Apache Arrow](https://arrow.apache.org/) record batches over the Arrow C +Data Interface. + +This guide covers installation, the `SessionContext` and `DataFrame` APIs, +and Parquet ingestion. + +- [Installation](installation.md) — JDK and Rust prerequisites, building + from source. +- [Quickstart](quickstart.md) — a complete example, walked through. +- [SessionContext](sessioncontext.md) — lifecycle and threading. +- [DataFrame and SQL](dataframe.md) — building and executing queries. +- [Parquet](parquet.md) — registering files and reading them with + `ParquetReadOptions`. +- [Project status](project-status.md) — snapshot of what works today. + +> Early development: no releases yet, API will change. Bug reports and +> contributions welcome. 
From 86eb8fbb564bd81a68ea03aa268e6ff83995fa2f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:16:18 -0600 Subject: [PATCH 06/20] docs: add user guide quickstart --- docs/source/user-guide/quickstart.md | 74 ++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docs/source/user-guide/quickstart.md diff --git a/docs/source/user-guide/quickstart.md b/docs/source/user-guide/quickstart.md new file mode 100644 index 0000000..7d0df38 --- /dev/null +++ b/docs/source/user-guide/quickstart.md @@ -0,0 +1,74 @@ + + +# Quickstart + +This page walks through a complete query end-to-end. + +## The full example + +```java +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.datafusion.DataFrame; +import org.apache.datafusion.SessionContext; + +try (var allocator = new RootAllocator(); + var ctx = new SessionContext()) { + + ctx.registerParquet("orders", "/path/to/orders.parquet"); + + try (DataFrame df = ctx.sql( + "SELECT o_orderpriority, COUNT(*) AS n " + + "FROM orders GROUP BY o_orderpriority"); + ArrowReader reader = df.collect(allocator)) { + while (reader.loadNextBatch()) { + var batch = reader.getVectorSchemaRoot(); + // ... + } + } +} +``` + +## Walkthrough + +**Allocator.** `RootAllocator` is the Arrow off-heap memory allocator. Every +JVM-side Arrow buffer is tracked under an allocator; when the allocator is +closed, leaked buffers are reported. Use one allocator per query (or one +per application) and close it in a `try`-with-resources. + +**Session context.** `SessionContext` is the entry point into DataFusion. It +holds the catalog of registered tables and the query planner. It is +`AutoCloseable` and **not thread-safe** — use one per thread, or guard +access externally. + +**Registering data.** `registerParquet(name, path)` reads the file's footer +on call and exposes it under the given table name. See +[Parquet](parquet.md) for the options form. 
+ +**SQL.** `ctx.sql("...")` plans the query and returns a `DataFrame`. The +query is not executed until results are pulled. + +**Collecting results.** `df.collect(allocator)` starts native execution and +returns an `ArrowReader`. Each `loadNextBatch()` call pulls the next +`VectorSchemaRoot`; iterate until it returns `false`. + +**Cleanup.** Both `SessionContext` and `DataFrame` are `AutoCloseable`. Use +`try`-with-resources so native resources and Arrow buffers are released +even on exception. From b5e0c96e3fb34dcd1d8f103009f0af8cff5d0994 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:17:08 -0600 Subject: [PATCH 07/20] docs: add user guide sessioncontext page --- docs/source/user-guide/sessioncontext.md | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docs/source/user-guide/sessioncontext.md diff --git a/docs/source/user-guide/sessioncontext.md b/docs/source/user-guide/sessioncontext.md new file mode 100644 index 0000000..14111b8 --- /dev/null +++ b/docs/source/user-guide/sessioncontext.md @@ -0,0 +1,48 @@ + + +# SessionContext + +`SessionContext` is the entry point into DataFusion from Java. It owns the +catalog of registered tables and the query planner. + +## Lifecycle + +```java +try (SessionContext ctx = new SessionContext()) { + // register tables, build queries... +} +``` + +`SessionContext` is `AutoCloseable`. Closing it releases the underlying +native context. Use `try`-with-resources so the native side is freed even +on exception. + +## Threading + +A `SessionContext` is **not thread-safe**. Do not share one across threads +without external synchronization. The simplest pattern is one context per +thread. + +## What's configurable today + +Today, `SessionContext` exposes only data-source registration and query +construction. Tuning knobs that DataFusion offers natively +(`SessionConfig`, `RuntimeEnv`) are not yet wired through the Java API. 
+See [Project status](project-status.md) for the current shape of the API. From abbaf0917a845bec54f621748aa86ce4b391c653 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:17:59 -0600 Subject: [PATCH 08/20] docs: add user guide dataframe page --- docs/source/user-guide/dataframe.md | 90 +++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 docs/source/user-guide/dataframe.md diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md new file mode 100644 index 0000000..e1ca33d --- /dev/null +++ b/docs/source/user-guide/dataframe.md @@ -0,0 +1,90 @@ + + +# DataFrame and SQL + +DataFusion Java supports two query interfaces: SQL strings via +`SessionContext.sql(String)`, and a programmatic DataFrame API. + +## SQL + +```java +try (DataFrame df = ctx.sql("SELECT a, b FROM t WHERE a > 10")) { + df.show(); +} +``` + +`sql(String)` plans the query and returns a `DataFrame`. Execution does +not start until you pull results. + +## DataFrame transformations + +The DataFrame API exposes `select` and `filter` today. Other +transformations are TBD — see [Project status](project-status.md). + +```java +try (DataFrame df = ctx.readParquet("/path/to/orders.parquet")) { + try (DataFrame filtered = df.filter("o_orderpriority = '1-URGENT'")) { + filtered.show(); + } +} +``` + +Each transformation returns a new `DataFrame` that must be closed. + +## Pulling results + +Three patterns are available: + +**Stream as Arrow batches.** Use `collect(allocator)` to pull the result +set as Arrow record batches via the [Arrow C Data Interface]: + +```java +try (DataFrame df = ctx.sql("SELECT ..."); + ArrowReader reader = df.collect(allocator)) { + while (reader.loadNextBatch()) { + var batch = reader.getVectorSchemaRoot(); + // process batch... 
+ } +} +``` + +[Arrow C Data Interface]: https://arrow.apache.org/docs/format/CDataInterface.html + +**Count rows.** `df.count()` returns the row count without materializing +the rows in the JVM. + +**Print for inspection.** `df.show()` and `df.show(int n)` print results +to standard output. Useful for exploration; not appropriate for +production code paths. + +## Schema introspection + +To get the schema of a registered table without running a query: + +```java +Schema schema = ctx.tableSchema("orders"); +``` + +## Plan input + +A DataFusion logical plan can be deserialized from `datafusion-proto` +bytes via `SessionContext.fromProto(byte[])`. The `datafusion-proto` Java +classes are generated by the Maven build. This is useful for accepting +plans produced by other DataFusion-aware tooling. From b7b692f48507e49949a2bd88687bd66420e7bd5c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:18:55 -0600 Subject: [PATCH 09/20] docs: add user guide parquet page --- docs/source/user-guide/parquet.md | 69 +++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 docs/source/user-guide/parquet.md diff --git a/docs/source/user-guide/parquet.md b/docs/source/user-guide/parquet.md new file mode 100644 index 0000000..7febbdd --- /dev/null +++ b/docs/source/user-guide/parquet.md @@ -0,0 +1,69 @@ + + +# Parquet + +DataFusion Java reads Parquet through two entry points on `SessionContext`: +`registerParquet` to expose a file as a named table, and `readParquet` to +get a `DataFrame` directly. + +## Register a table + +```java +ctx.registerParquet("orders", "/path/to/orders.parquet"); + +try (DataFrame df = ctx.sql("SELECT * FROM orders LIMIT 10")) { + df.show(); +} +``` + +The file's footer is read at registration time. The table remains in the +catalog for the lifetime of the `SessionContext`. 
+ +## Read a DataFrame directly + +```java +try (DataFrame df = ctx.readParquet("/path/to/orders.parquet")) { + df.show(); +} +``` + +`readParquet` skips the catalog and hands back a `DataFrame` straight +away. + +## ParquetReadOptions + +Both entry points accept a `ParquetReadOptions` to tune the underlying +read. Construct one with the builder: + +```java +ParquetReadOptions opts = ParquetReadOptions.builder() + .fileExtension(".parquet") + .build(); + +ctx.registerParquet("orders", "/path/to/orders.parquet", opts); +// or +try (DataFrame df = ctx.readParquet("/path/to/orders.parquet", opts)) { + df.show(); +} +``` + +The supported options track what DataFusion exposes on its Rust +`ParquetReadOptions` builder. Inspect the class on the Java side for the +exact setters available in the version you are using. From f2767008cea3164f923e530451bb13975a9e6d44 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:19:45 -0600 Subject: [PATCH 10/20] docs: add user guide project status page --- docs/source/user-guide/project-status.md | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docs/source/user-guide/project-status.md diff --git a/docs/source/user-guide/project-status.md b/docs/source/user-guide/project-status.md new file mode 100644 index 0000000..aacb19f --- /dev/null +++ b/docs/source/user-guide/project-status.md @@ -0,0 +1,48 @@ + + +# Project status + +A snapshot of what works today. The library is in early development; the +API will change before the first release. + +## Query interfaces + +- [x] SQL: `SessionContext.sql(String)` +- [x] DataFrame: `select`, `filter` (other transformations TBD) +- [x] DataFusion-Proto `LogicalPlanNode`: `SessionContext.fromProto(byte[])`. + The `datafusion-proto` Java classes are generated by the build. 
+ +## Data sources + +- [x] Parquet via `registerParquet` / `readParquet`, with `ParquetReadOptions` +- [ ] CSV, JSON, Avro +- [ ] Custom catalog and table providers + +## Results + +- [x] `DataFrame.collect(allocator)` — Arrow C Data Interface stream +- [x] `DataFrame.count()`, `show()`, `show(int)` +- [x] `SessionContext.tableSchema(String)` + +## Not yet + +- [ ] `SessionConfig` / `RuntimeEnv` knobs +- [ ] Java UDFs +- [ ] `write_*` outputs From 4552569217f18e612c5df0292bee1a37c18ecd88 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:20:48 -0600 Subject: [PATCH 11/20] docs: write contributor guide landing page --- docs/source/contributor-guide/index.md | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index bac4c60..8afa315 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -19,4 +19,30 @@ under the License. # Contributor Guide -This guide is under construction. +Bug reports, design discussion, and patches are welcome. This project follows +the Apache DataFusion contribution model. + +## Filing issues and discussing changes + +- File bugs and feature requests on + [GitHub issues](https://github.com/apache/datafusion-java/issues). +- For larger or design-level discussion, the mailing list is + [dev@datafusion.apache.org](mailto:dev@datafusion.apache.org). +- Please open an issue before sending a PR for any significant change so + the approach can be agreed on first. + +## Development workflow + +Branch from `main`, write changes with +[conventional commit](https://www.conventionalcommits.org/) messages in +the imperative mood (e.g. `feat: add foo`, `fix(native): handle bar`), +and open a pull request targeting `main`. + +## Topics + +- [Development](development.md) — build prerequisites, running tests, + TPC-H test data, repo layout. 
+- [Code style](code-style.md) — formatters and license headers. +- [Releasing](releasing.md) — Apache release process (placeholder). +- [Updating DataFusion / protobuf version](updating-datafusion-version.md) — + step-by-step recipe. From 056d4876ae645695b4dfec229340affa4758ecfe Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:21:45 -0600 Subject: [PATCH 12/20] docs: add contributor guide development page --- docs/source/contributor-guide/development.md | 74 ++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docs/source/contributor-guide/development.md diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md new file mode 100644 index 0000000..e98b87d --- /dev/null +++ b/docs/source/contributor-guide/development.md @@ -0,0 +1,74 @@ + + +# Development + +## Build prerequisites + +- JDK 17 or newer. +- Rust toolchain (stable, installed via [rustup]). +- [`tpchgen-cli`] — only needed to generate test data for the Parquet + integration test (`cargo install tpchgen-cli`). + +Maven is bundled via the `./mvnw` wrapper; no separate Maven install is +required. + +[rustup]: https://rustup.rs/ +[`tpchgen-cli`]: https://github.com/clflushopt/tpchgen-rs + +## Build and test + +```sh +make test +``` + +This builds the native Rust crate and runs the JUnit tests. The steps can +be run individually: + +```sh +cd native && cargo build +./mvnw test +``` + +The native library must be built before running JVM tests. + +The first build in a fresh checkout reaches out to +`raw.githubusercontent.com` to fetch the DataFusion `.proto` files used +to generate the `datafusion-proto` Java classes. Subsequent builds are +offline; the `download-maven-plugin` cache under +`~/.m2/repository/.cache/` satisfies them. + +## Test data + +The Parquet integration test reads TPC-H SF1 data (~345 MB across 8 tables +in Snappy-compressed Parquet). 
Generate it once with: + +```sh +make tpch-data +``` + +Tests that need this data skip cleanly if it is missing. `make clean` +does **not** remove `tpch-data/` — delete it manually to reclaim the +disk space. + +## Repository layout + +- `src/` — Java sources and tests. +- `native/` — Rust crate (JNI + Arrow C Data Interface). +- `docs/` — Sphinx documentation source and build scripts. From a947eab03850bde8ba8e9cd49f83bfa4cd8d7793 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:22:29 -0600 Subject: [PATCH 13/20] docs: add contributor guide code style page --- docs/source/contributor-guide/code-style.md | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 docs/source/contributor-guide/code-style.md diff --git a/docs/source/contributor-guide/code-style.md b/docs/source/contributor-guide/code-style.md new file mode 100644 index 0000000..3c6020c --- /dev/null +++ b/docs/source/contributor-guide/code-style.md @@ -0,0 +1,46 @@ + + +# Code style + +## Java + +Run the Spotless formatter before committing. CI fails the build if +formatting drifts: + +```sh +./mvnw spotless:apply +``` + +## Rust + +Run inside `native/`: + +```sh +cargo fmt +cargo clippy --all-targets -- -D warnings +``` + +`-D warnings` turns clippy warnings into build failures, matching CI. + +## License headers + +New source files need the Apache 2.0 license header. Apache RAT enforces +this during `verify` — `./mvnw verify` will fail if a tracked file is +missing the header. 
From b3e7de83a8aaca6b3956111850a16df2a16f522d Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 13 May 2026 07:23:17 -0600
Subject: [PATCH 14/20] docs: add contributor guide releasing placeholder

---
 docs/source/contributor-guide/releasing.md | 28 ++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 docs/source/contributor-guide/releasing.md

diff --git a/docs/source/contributor-guide/releasing.md b/docs/source/contributor-guide/releasing.md
new file mode 100644
index 0000000..8210d2b
--- /dev/null
+++ b/docs/source/contributor-guide/releasing.md
@@ -0,0 +1,28 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Releasing
+
+There are no releases of Apache DataFusion Java yet. Once the first
+release approaches, this page will document the Apache release process
+the project follows.
+
+In the meantime, refer to the
+[Apache DataFusion release process](https://datafusion.apache.org/contributor-guide/release/index.html)
+for the broader pattern used by sibling subprojects.

From 0c05dc7c9c9215e5f8ba762193e229f78d267a5e Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 13 May 2026 07:24:08 -0600
Subject: [PATCH 15/20] docs: add contributor guide datafusion bump recipe

---
 .../updating-datafusion-version.md | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 docs/source/contributor-guide/updating-datafusion-version.md

diff --git a/docs/source/contributor-guide/updating-datafusion-version.md b/docs/source/contributor-guide/updating-datafusion-version.md
new file mode 100644
index 0000000..56d50dc
--- /dev/null
+++ b/docs/source/contributor-guide/updating-datafusion-version.md
@@ -0,0 +1,62 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Updating the DataFusion / protobuf schema version
+
+Three things must move together when bumping DataFusion:
+
+1. `native/Cargo.toml` — the `datafusion` crate dependency.
+2. `pom.xml` — the `<datafusion.version>` Maven property. **Must equal
+   the Cargo version**; a mismatch means JVM-built protobuf plans won't
+   deserialize on the native side.
+3. 
`pom.xml` — the `<sha512>` checksums on the two `download-maven-plugin`
+   executions. These pin the downloaded `.proto` files; the build fails
+   if upstream silently re-tags them, which is the desired behavior.
+
+## Recipe
+
+```sh
+# 1. Bump the Cargo dep
+$EDITOR native/Cargo.toml # set datafusion = "<new version>"
+(cd native && cargo update -p datafusion)
+
+# 2. Bump the Maven property to match
+$EDITOR pom.xml # set <datafusion.version>
+
+# 3. Compute the new SHA-512 hashes for both `.proto` files from the
+# upstream tag you just set in step 2, then paste them into the two
+# <sha512> elements in pom.xml.
+NEW=$(grep -m1 -oE '<datafusion.version>[^<]+' pom.xml | cut -d'>' -f2)
+curl -sL "https://raw.githubusercontent.com/apache/datafusion/$NEW/datafusion/proto-common/proto/datafusion_common.proto" | shasum -a 512 | awk '{print $1}'
+curl -sL "https://raw.githubusercontent.com/apache/datafusion/$NEW/datafusion/proto/proto/datafusion.proto" | shasum -a 512 | awk '{print $1}'
+$EDITOR pom.xml # paste the two hashes into the <sha512> elements
+
+# Drop the local download cache so the next build re-downloads against
+# the new hashes.
+rm -rf ~/.m2/repository/.cache/download-maven-plugin target/proto
+
+# 4. Verify
+make && make test
+```
+
+## Why the protobuf runtime version is separate
+
+The protobuf runtime version (`<protobuf.version>` in `pom.xml`) tracks
+the Java ecosystem (security and JDK compatibility), not DataFusion.
+Bump it independently when there is a reason.

From 7cf3531dd96e0350cf0c72cb3ae650d693549f98 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Wed, 13 May 2026 07:25:48 -0600
Subject: [PATCH 16/20] docs: trim README and link to docs site

---
 README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9635952..8085e8d 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,62 @@
-# DataFusion Java Bindings
+# Apache DataFusion Java
+
+Java bindings for [Apache DataFusion]. 
Queries run in native Rust and results +return to the JVM as [Apache Arrow] batches via the Arrow C Data Interface. + +[Apache DataFusion]: https://datafusion.apache.org/ +[Apache Arrow]: https://arrow.apache.org/ + +> Early development: no releases yet, API will change. Bug reports and +> contributions welcome. + +## Quickstart + +```java +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.datafusion.DataFrame; +import org.apache.datafusion.SessionContext; + +try (var allocator = new RootAllocator(); + var ctx = new SessionContext()) { + + ctx.registerParquet("orders", "/path/to/orders.parquet"); + + try (DataFrame df = ctx.sql( + "SELECT o_orderpriority, COUNT(*) AS n " + + "FROM orders GROUP BY o_orderpriority"); + ArrowReader reader = df.collect(allocator)) { + while (reader.loadNextBatch()) { + var batch = reader.getVectorSchemaRoot(); + // ... + } + } +} +``` + +`SessionContext` and `DataFrame` are `AutoCloseable` and not thread-safe. + +## Documentation + +The full documentation lives under [`docs/source/`](docs/source/index.md) +and is built with Sphinx (see [`docs/README.md`](docs/README.md) for the +build steps): + +- [User guide](docs/source/user-guide/index.md) — installation, the + DataFrame and SQL APIs, Parquet ingestion, project status. +- [Contributor guide](docs/source/contributor-guide/index.md) — build, + test, code style, and how to bump the DataFusion version. + +## Requirements + +JDK 17+. Building from source: see +[`docs/source/contributor-guide/development.md`](docs/source/contributor-guide/development.md). + +## Contributing + +Open an issue to discuss non-trivial changes before sending a PR. See the +[contributor guide](docs/source/contributor-guide/index.md). + +## License + +Apache License 2.0. See [LICENSE.txt](LICENSE.txt) and [NOTICE.txt](NOTICE.txt). 
From 62216f800ca7e67286417cd024b68e1bbd3f80e9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:28:09 -0600 Subject: [PATCH 17/20] docs: trim CONTRIBUTING and link to docs site --- CONTRIBUTING.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..48688d8 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,33 @@ + + +# Contributing to Apache DataFusion Java + +Bug reports, design discussion, and patches are welcome. This project follows +the Apache DataFusion contribution model. + +- File bugs and feature requests on + [GitHub issues](https://github.com/apache/datafusion-java/issues). +- For larger or design-level discussion, the mailing list is + [dev@datafusion.apache.org](mailto:dev@datafusion.apache.org). +- Please open an issue before sending a PR for any significant change so + the approach can be agreed on first. + +For build, test, code style, and version-bump workflows, see the +[contributor guide](docs/source/contributor-guide/index.md). From 9cb7dc35424124f3489e7030f51ada69cba8cc44 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:33:30 -0600 Subject: [PATCH 18/20] docs: fix incorrect ParquetReadOptions API and tighten development page --- docs/source/contributor-guide/development.md | 3 +++ docs/source/user-guide/dataframe.md | 2 +- docs/source/user-guide/parquet.md | 9 ++++----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index e98b87d..c6818f6 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -69,6 +69,9 @@ disk space. ## Repository layout +- `pom.xml` — Maven build descriptor. +- `Makefile` — top-level build orchestration (`make test`, `make tpch-data`). +- `mvnw`, `mvnw.cmd` — bundled Maven wrapper. 
- `src/` — Java sources and tests. - `native/` — Rust crate (JNI + Arrow C Data Interface). - `docs/` — Sphinx documentation source and build scripts. diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md index e1ca33d..e91eab7 100644 --- a/docs/source/user-guide/dataframe.md +++ b/docs/source/user-guide/dataframe.md @@ -79,7 +79,7 @@ production code paths. To get the schema of a registered table without running a query: ```java -Schema schema = ctx.tableSchema("orders"); +org.apache.arrow.vector.types.pojo.Schema schema = ctx.tableSchema("orders"); ``` ## Plan input diff --git a/docs/source/user-guide/parquet.md b/docs/source/user-guide/parquet.md index 7febbdd..73b74c8 100644 --- a/docs/source/user-guide/parquet.md +++ b/docs/source/user-guide/parquet.md @@ -50,12 +50,11 @@ away. ## ParquetReadOptions Both entry points accept a `ParquetReadOptions` to tune the underlying -read. Construct one with the builder: +read. Construct one directly and chain setters: ```java -ParquetReadOptions opts = ParquetReadOptions.builder() - .fileExtension(".parquet") - .build(); +ParquetReadOptions opts = new ParquetReadOptions() + .fileExtension(".parquet"); ctx.registerParquet("orders", "/path/to/orders.parquet", opts); // or @@ -64,6 +63,6 @@ try (DataFrame df = ctx.readParquet("/path/to/orders.parquet", opts)) { } ``` -The supported options track what DataFusion exposes on its Rust +The supported setters track what DataFusion exposes on its Rust `ParquetReadOptions` builder. Inspect the class on the Java side for the exact setters available in the version you are using. 
From a691900b6a535e87ab60821ce3fdb37447f3aac4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:39:58 -0600 Subject: [PATCH 19/20] docs: nest toctrees in section index pages for sidebar nav --- docs/source/contributor-guide/index.md | 16 ++++++++-------- docs/source/index.md | 25 ++++--------------------- docs/source/user-guide/index.md | 18 ++++++++++-------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 8afa315..de7f22a 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -38,11 +38,11 @@ Branch from `main`, write changes with the imperative mood (e.g. `feat: add foo`, `fix(native): handle bar`), and open a pull request targeting `main`. -## Topics - -- [Development](development.md) — build prerequisites, running tests, - TPC-H test data, repo layout. -- [Code style](code-style.md) — formatters and license headers. -- [Releasing](releasing.md) — Apache release process (placeholder). -- [Updating DataFusion / protobuf version](updating-datafusion-version.md) — - step-by-step recipe. +```{toctree} +:maxdepth: 1 + +development +code-style +releasing +updating-datafusion-version +``` diff --git a/docs/source/index.md b/docs/source/index.md index 675021e..0ee9519 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -69,27 +69,10 @@ Code of Conduct +Contributor Guide ``` diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md index d5cbeb9..13728a5 100644 --- a/docs/source/user-guide/index.md +++ b/docs/source/user-guide/index.md @@ -28,14 +28,16 @@ Data Interface. This guide covers installation, the `SessionContext` and `DataFrame` APIs, and Parquet ingestion. -- [Installation](installation.md) — JDK and Rust prerequisites, building - from source. -- [Quickstart](quickstart.md) — a complete example, walked through. 
-- [SessionContext](sessioncontext.md) — lifecycle and threading. -- [DataFrame and SQL](dataframe.md) — building and executing queries. -- [Parquet](parquet.md) — registering files and reading them with - `ParquetReadOptions`. -- [Project status](project-status.md) — snapshot of what works today. +```{toctree} +:maxdepth: 1 + +installation +quickstart +sessioncontext +dataframe +parquet +project-status +``` > Early development: no releases yet, API will change. Bug reports and > contributions welcome. From c8018e48ff560cfe5cd6925a5cb5b1db5823aad2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 May 2026 07:48:05 -0600 Subject: [PATCH 20/20] docs: add user guide page on building plans via datafusion-proto --- docs/source/user-guide/index.md | 1 + docs/source/user-guide/proto-plans.md | 201 ++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 docs/source/user-guide/proto-plans.md diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md index 13728a5..289eafa 100644 --- a/docs/source/user-guide/index.md +++ b/docs/source/user-guide/index.md @@ -36,6 +36,7 @@ quickstart sessioncontext dataframe parquet +proto-plans project-status ``` diff --git a/docs/source/user-guide/proto-plans.md b/docs/source/user-guide/proto-plans.md new file mode 100644 index 0000000..513018b --- /dev/null +++ b/docs/source/user-guide/proto-plans.md @@ -0,0 +1,201 @@ + + +# Logical plans via datafusion-proto + +`SessionContext.fromProto(byte[])` accepts a serialized DataFusion +`LogicalPlanNode` and returns a lazy `DataFrame`. This is useful when you +already have a plan produced by another DataFusion-aware tool, or when +you want to construct the plan programmatically with finer-grained +control than the `sql` or DataFrame APIs. 
+ +The protobuf Java classes are generated by the build into the +`org.apache.datafusion.protobuf` (plan and expression nodes) and +`datafusion_common` (scalar values, schema, column references, file +formats) packages. The Maven build downloads pinned `.proto` files from +the matching upstream DataFusion tag on first build, then generates the +Java classes locally — see the +[Contributor Guide](../contributor-guide/updating-datafusion-version.md) +for how to bump the version. + +## A minimal plan + +The smallest interesting plan is a projection of a literal over an +empty input. It is useful as a sanity check and exercises serialization +end-to-end without touching any storage. + +```java +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.datafusion.DataFrame; +import org.apache.datafusion.SessionContext; +import org.apache.datafusion.protobuf.EmptyRelationNode; +import org.apache.datafusion.protobuf.LogicalExprNode; +import org.apache.datafusion.protobuf.LogicalPlanNode; +import org.apache.datafusion.protobuf.ProjectionNode; + +import datafusion_common.DatafusionCommon; + +LogicalPlanNode plan = + LogicalPlanNode.newBuilder() + .setProjection( + ProjectionNode.newBuilder() + .setInput( + LogicalPlanNode.newBuilder() + .setEmptyRelation( + EmptyRelationNode.newBuilder().setProduceOneRow(true).build()) + .build()) + .addExpr( + LogicalExprNode.newBuilder() + .setLiteral( + DatafusionCommon.ScalarValue.newBuilder().setInt32Value(1).build()) + .build()) + .build()) + .build(); + +try (var allocator = new RootAllocator(); + SessionContext ctx = new SessionContext(); + DataFrame df = ctx.fromProto(plan.toByteArray()); + ArrowReader reader = df.collect(allocator)) { + reader.loadNextBatch(); + VectorSchemaRoot batch = reader.getVectorSchemaRoot(); + IntVector col = (IntVector) batch.getVector(0); + 
System.out.println(col.get(0)); // 1 +} +``` + +`fromProto` performs the same logical-planning, optimization, and +physical-planning pipeline as `sql`; the result is a lazy +[`DataFrame`](dataframe.md) that only executes when you pull results. + +## Scanning a Parquet file via ListingTableScanNode + +A `ListingTableScanNode` reads one or more files of the same format +from disk. Unlike `registerParquet`, it does not require the table to +be in the catalog — the scan node carries everything DataFusion needs: +the file paths, the schema, the projection, the file format, and the +target partition count. + +The scan node's `schema` field is a `datafusion_common.Schema`, not an +Arrow `Schema`. Convert between the two with the helper in +`org.apache.datafusion.proto.SchemaConverter`: + +```java +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.datafusion.proto.SchemaConverter; + +Schema arrow = ctx.tableSchema("lineitem"); +DatafusionCommon.Schema schemaProto = SchemaConverter.toProto(arrow); +``` + +The full example: register the file once to introspect its schema, +then build a plan that scans the same file, sorts by `l_orderkey`, +and fetches the first row. This is equivalent to +`SELECT l_orderkey FROM lineitem ORDER BY l_orderkey LIMIT 1`. 
+ +```java +import org.apache.datafusion.protobuf.BareTableReference; +import org.apache.datafusion.protobuf.ListingTableScanNode; +import org.apache.datafusion.protobuf.ProjectionColumns; +import org.apache.datafusion.protobuf.SortExprNode; +import org.apache.datafusion.protobuf.SortNode; +import org.apache.datafusion.protobuf.TableReference; + +String path = "/path/to/lineitem.parquet"; + +try (var allocator = new RootAllocator(); + SessionContext ctx = new SessionContext()) { + + ctx.registerParquet("lineitem", path); + DatafusionCommon.Schema schemaProto = + SchemaConverter.toProto(ctx.tableSchema("lineitem")); + + LogicalExprNode orderKeyCol = + LogicalExprNode.newBuilder() + .setColumn(DatafusionCommon.Column.newBuilder().setName("l_orderkey").build()) + .build(); + + LogicalPlanNode plan = + LogicalPlanNode.newBuilder() + .setSort( + SortNode.newBuilder() + .setInput( + LogicalPlanNode.newBuilder() + .setListingScan( + ListingTableScanNode.newBuilder() + .setTableName( + TableReference.newBuilder() + .setBare( + BareTableReference.newBuilder() + .setTable("lineitem") + .build()) + .build()) + .addPaths(path) + .setFileExtension(".parquet") + .setSchema(schemaProto) + .setProjection( + ProjectionColumns.newBuilder() + .addColumns("l_orderkey") + .build()) + .setParquet( + DatafusionCommon.ParquetFormat.getDefaultInstance()) + .setTargetPartitions(1) + .build()) + .build()) + .addExpr( + SortExprNode.newBuilder() + .setExpr(orderKeyCol) + .setAsc(true) + .setNullsFirst(false) + .build()) + .setFetch(1) + .build()) + .build(); + + try (DataFrame df = ctx.fromProto(plan.toByteArray()); + ArrowReader reader = df.collect(allocator)) { + reader.loadNextBatch(); + // ... + } +} +``` + +## When to use proto plans + +The `sql` and DataFrame APIs are the right choice for most workloads. +Reach for `fromProto` when you need one of: + +- **Cross-tool interop.** Accept plans produced by another + DataFusion-based system (a planner, a scheduler, a query frontend). 
+- **Programmatic plan construction.** Build the plan node tree directly + instead of going through SQL parsing, useful for tools that compile + their own surface language to DataFusion. +- **Plan persistence.** Serialize a plan to bytes, store or transmit + it, and execute it later — possibly in a different process or on a + different machine. + +## Schema conversion support + +`SchemaConverter.toProto` and `SchemaConverter.fromProto` support the +primitive Arrow types this project's tests exercise: `Bool`, signed and +unsigned integer types 8 through 64 bits, `Float32`, `Float64`, `Utf8`, +`Utf8View`, `LargeUtf8`, `Date32`, and `Decimal128`. Anything else +raises `UnsupportedOperationException` naming the offending type.