From faf447b7abdd10b2738cd58a13eecd265ba5a413 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 09:19:50 -0400 Subject: [PATCH 1/8] Set fail on warning for documentation generation --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index e65c8e250..49ebae372 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -35,4 +35,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --fail-on-warning \ No newline at end of file From 1a8a7a44ee1fa6b362f2cfec4c486e5bd41feea9 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 09:33:42 -0400 Subject: [PATCH 2/8] Avoid building the wheel if possible during documentation generation --- .github/workflows/docs.yaml | 87 +++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index c24fa5ade..724fbe64a 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -7,19 +7,25 @@ on: pull_request: branches: - main + # Run after build workflow completes so we can get the built artifact + workflow_run: + workflows: ["Python Release Build"] + types: + - completed name: Deploy DataFusion Python site jobs: - debug-github-context: + debug-github-context: name: Print github context runs-on: ubuntu-latest steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJson(github) }} - run: | - echo "$GITHUB_CONTEXT" + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: | + echo "$GITHUB_CONTEXT" + build-docs: name: Build docs runs-on: ubuntu-latest @@ -37,8 +43,10 @@ jobs: echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" exit 1 fi + - name: Checkout docs sources uses: actions/checkout@v5 + - name: Checkout docs target branch if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') uses: actions/checkout@v5 @@ -46,24 +54,75 @@ jobs: fetch-depth: 0 ref: ${{ steps.target-branch.outputs.value }} path: docs-target + - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.11" - - name: Install Protoc - uses: arduino/setup-protoc@v3 + - name: Install dependencies + uses: astral-sh/setup-uv@v6 with: - version: '27.4' - repo-token: ${{ secrets.GITHUB_TOKEN }} + enable-cache: true - - name: Install dependencies and build - uses: astral-sh/setup-uv@v6 + # Try to download pre-built wheel from the build workflow + - name: Download wheel from build workflow + id: download-wheel + continue-on-error: true + uses: actions/download-artifact@v5 with: - enable-cache: true + name: dist + path: wheels/ + # For workflow_run events, get artifacts from the triggering workflow + run-id: ${{ github.event.workflow_run.id || github.run_id }} - - name: Build repo + # Check if we have a compatible wheel + - name: Check for compatible wheel + id: check-wheel run: | + set -x + if [ -d "wheels/" ] && [ "$(ls -A wheels/)" ]; then + echo "Available wheels:" + ls -la wheels/ + + # Find a compatible wheel for Linux x86_64 (the docs runner) + WHEEL=$(find wheels/ -name "*linux_x86_64*.whl" -o -name "*manylinux*x86_64*.whl" | head -1) + if [ -n "$WHEEL" ]; then + echo "Found compatible wheel: $WHEEL" + echo 
"wheel-found=true" >> "$GITHUB_OUTPUT" + echo "wheel-path=$WHEEL" >> "$GITHUB_OUTPUT" + else + echo "No compatible wheel found for Linux x86_64" + echo "wheel-found=false" >> "$GITHUB_OUTPUT" + fi + else + echo "No wheels directory or wheels found" + echo "wheel-found=false" >> "$GITHUB_OUTPUT" + fi + + # Install from pre-built wheel if available + - name: Install from pre-built wheel + if: steps.check-wheel.outputs.wheel-found == 'true' + run: | + set -x + uv venv + # Install documentation dependencies + uv sync --dev --no-install-package datafusion --group docs + # Install the pre-built wheel + uv pip install "${{ steps.check-wheel.outputs.wheel-path }}" + echo "Installed datafusion from pre-built wheel" + + # Fallback: Build from source if no wheel is available + - name: Build from source (fallback) + if: steps.check-wheel.outputs.wheel-found != 'true' + run: | + set -x + echo "No compatible pre-built wheel found, building from source" + + # Install Protoc for building from source + sudo apt-get update + sudo apt-get install -y protobuf-compiler + uv venv uv sync --dev --no-install-package datafusion --group docs uv run --no-project maturin develop --uv From 31d5cf8e7af0b252416ec894925cfdb5085e7fef Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 09:48:04 -0400 Subject: [PATCH 3/8] Revert "Avoid building the wheel if possible during documentation generation" This reverts commit 1a8a7a44ee1fa6b362f2cfec4c486e5bd41feea9. --- .github/workflows/docs.yaml | 87 ++++++------------------------------- 1 file changed, 14 insertions(+), 73 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 724fbe64a..c24fa5ade 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -7,25 +7,19 @@ on: pull_request: branches: - main - # Run after build workflow completes so we can get the built artifact - workflow_run: - workflows: ["Python Release Build"] - types: - - completed name: Deploy DataFusion Python site jobs: - debug-github-context: + debug-github-context: name: Print github context runs-on: ubuntu-latest steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJson(github) }} - run: | - echo "$GITHUB_CONTEXT" - + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: | + echo "$GITHUB_CONTEXT" build-docs: name: Build docs runs-on: ubuntu-latest @@ -43,10 +37,8 @@ jobs: echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" exit 1 fi - - name: Checkout docs sources uses: actions/checkout@v5 - - name: Checkout docs target branch if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') uses: actions/checkout@v5 @@ -54,75 +46,24 @@ jobs: fetch-depth: 0 ref: ${{ steps.target-branch.outputs.value }} path: docs-target - - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.11" - - name: Install dependencies - uses: astral-sh/setup-uv@v6 + - name: Install Protoc + uses: arduino/setup-protoc@v3 with: - enable-cache: true + version: '27.4' + repo-token: ${{ secrets.GITHUB_TOKEN }} - # Try to download pre-built wheel from the build workflow - - name: Download wheel from build workflow - id: download-wheel - continue-on-error: true - uses: actions/download-artifact@v5 + - name: Install dependencies and build + uses: astral-sh/setup-uv@v6 with: - name: dist - path: wheels/ - # For workflow_run events, get artifacts from the triggering workflow - run-id: ${{ github.event.workflow_run.id || github.run_id }} + enable-cache: true - # 
Check if we have a compatible wheel - - name: Check for compatible wheel - id: check-wheel + - name: Build repo run: | - set -x - if [ -d "wheels/" ] && [ "$(ls -A wheels/)" ]; then - echo "Available wheels:" - ls -la wheels/ - - # Find a compatible wheel for Linux x86_64 (the docs runner) - WHEEL=$(find wheels/ -name "*linux_x86_64*.whl" -o -name "*manylinux*x86_64*.whl" | head -1) - if [ -n "$WHEEL" ]; then - echo "Found compatible wheel: $WHEEL" - echo "wheel-found=true" >> "$GITHUB_OUTPUT" - echo "wheel-path=$WHEEL" >> "$GITHUB_OUTPUT" - else - echo "No compatible wheel found for Linux x86_64" - echo "wheel-found=false" >> "$GITHUB_OUTPUT" - fi - else - echo "No wheels directory or wheels found" - echo "wheel-found=false" >> "$GITHUB_OUTPUT" - fi - - # Install from pre-built wheel if available - - name: Install from pre-built wheel - if: steps.check-wheel.outputs.wheel-found == 'true' - run: | - set -x - uv venv - # Install documentation dependencies - uv sync --dev --no-install-package datafusion --group docs - # Install the pre-built wheel - uv pip install "${{ steps.check-wheel.outputs.wheel-path }}" - echo "Installed datafusion from pre-built wheel" - - # Fallback: Build from source if no wheel is available - - name: Build from source (fallback) - if: steps.check-wheel.outputs.wheel-found != 'true' - run: | - set -x - echo "No compatible pre-built wheel found, building from source" - - # Install Protoc for building from source - sudo apt-get update - sudo apt-get install -y protobuf-compiler - uv venv uv sync --dev --no-install-package datafusion --group docs uv run --no-project maturin develop --uv From b7c1eedad9bfc1a823f0bf49a0939b6a16577a8c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 09:49:25 -0400 Subject: [PATCH 4/8] Move documentation into build workflow so that it is guaranteed to run after wheel build --- .github/workflows/build.yml | 95 ++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc770bbc9..97ece3d42 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -271,7 +271,100 @@ jobs: with: name: dist pattern: dist-* - + + # Documentation build job that runs after wheels are built + build-docs: + name: Build docs + runs-on: ubuntu-latest + needs: [build-manylinux-x86_64] # Only need the Linux wheel for docs + # Only run docs on main branch pushes, tags, or PRs + if: github.event_name == 'push' || github.event_name == 'pull_request' + steps: + - name: Set target branch + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + id: target-branch + run: | + set -x + if test '${{ github.ref }}' = 'refs/heads/main'; then + echo "value=asf-staging" >> "$GITHUB_OUTPUT" + elif test '${{ github.ref_type }}' = 'tag'; then + echo "value=asf-site" >> "$GITHUB_OUTPUT" + else + echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" + exit 1 + fi + + - name: Checkout docs sources + uses: actions/checkout@v5 + + - name: Checkout docs target branch + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + uses: actions/checkout@v5 + with: + fetch-depth: 0 + ref: ${{ steps.target-branch.outputs.value }} + path: docs-target + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + # Download the Linux wheel built 
in the previous job + - name: Download pre-built Linux wheel + uses: actions/download-artifact@v5 + with: + name: dist-manylinux-x86_64 + path: wheels/ + + # Install from the pre-built wheel + - name: Install from pre-built wheel + run: | + set -x + uv venv + # Install documentation dependencies + uv sync --dev --no-install-package datafusion --group docs + # Install the pre-built wheel + WHEEL=$(find wheels/ -name "*.whl" | head -1) + if [ -n "$WHEEL" ]; then + echo "Installing wheel: $WHEEL" + uv pip install "$WHEEL" + else + echo "ERROR: No wheel found!" + exit 1 + fi + + - name: Build docs + run: | + set -x + cd docs + curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv + curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet + uv run --no-project make html + + - name: Copy & push the generated HTML + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + run: | + set -x + cd docs-target + # delete anything but: 1) '.'; 2) '..'; 3) .git/ + find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf + cp ../.asf.yaml . + cp -r ../docs/build/html/* . + git status --porcelain + if [ "$(git status --porcelain)" != "" ]; then + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add --all + git commit -m 'Publish built docs triggered by ${{ github.sha }}' + git push || git push --force + fi + # NOTE: PyPI publish needs to be done manually for now after release passed the vote # release: # name: Publish in PyPI From a9d421feacf705837b40f6af35a421d77d2a3759 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 09:49:51 -0400 Subject: [PATCH 5/8] Remove redundant documentatino build --- .github/workflows/docs.yaml | 95 ------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 .github/workflows/docs.yaml diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml deleted file mode 100644 index c24fa5ade..000000000 --- a/.github/workflows/docs.yaml +++ /dev/null @@ -1,95 +0,0 @@ -on: - push: - branches: - - main - tags-ignore: - - "**-rc**" - pull_request: - branches: - - main - -name: Deploy DataFusion Python site - -jobs: - debug-github-context: - name: Print github context - runs-on: ubuntu-latest - steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJson(github) }} - run: | - echo "$GITHUB_CONTEXT" - build-docs: - name: Build docs - runs-on: ubuntu-latest - steps: - - name: Set target branch - if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') - id: target-branch - run: | - set -x - if test '${{ github.ref }}' = 'refs/heads/main'; then - echo "value=asf-staging" >> "$GITHUB_OUTPUT" - elif test '${{ github.ref_type }}' = 'tag'; then - echo "value=asf-site" >> "$GITHUB_OUTPUT" - else - echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" - exit 1 - fi - - name: Checkout docs sources - uses: actions/checkout@v5 - - name: Checkout docs target branch - if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') - uses: actions/checkout@v5 - with: - fetch-depth: 0 - ref: ${{ steps.target-branch.outputs.value }} - path: docs-target - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install Protoc - uses: arduino/setup-protoc@v3 - with: - version: '27.4' - 
repo-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Install dependencies and build - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - - - name: Build repo - run: | - uv venv - uv sync --dev --no-install-package datafusion --group docs - uv run --no-project maturin develop --uv - - - name: Build docs - run: | - set -x - cd docs - curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv - curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet - uv run --no-project make html - - - name: Copy & push the generated HTML - if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') - run: | - set -x - cd docs-target - # delete anything but: 1) '.'; 2) '..'; 3) .git/ - find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf - cp ../.asf.yaml . - cp -r ../docs/build/html/* . - git status --porcelain - if [ "$(git status --porcelain)" != "" ]; then - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add --all - git commit -m 'Publish built docs triggered by ${{ github.sha }}' - git push || git push --force - fi From 615e2a123f5d89c75433296de3ec7a1e1dc6b903 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 10:39:28 -0400 Subject: [PATCH 6/8] Move parameters into init method to fix documentation error --- python/datafusion/dataframe.py | 181 +++++++++++++++++---------------- 1 file changed, 94 insertions(+), 87 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 61cb09438..e8c26ba33 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -119,68 +119,8 @@ class ParquetWriterOptions: """Advanced parquet writer options. Allows settings the writer options that apply to the entire file. Some options can - also be set on a column by column basis, with the field `column_specific_options` - (see `ParquetColumnOptions`). - - Attributes: - data_pagesize_limit: Sets best effort maximum size of data page in bytes. - write_batch_size: Sets write_batch_size in bytes. - writer_version: Sets parquet writer version. Valid values are `1.0` and - `2.0`. - skip_arrow_metadata: Skip encoding the embedded arrow metadata in the - KV_meta. - compression: Compression type to use. Default is "zstd(3)". - Available compression types are - - "uncompressed": No compression. - - "snappy": Snappy compression. - - "gzip(n)": Gzip compression with level n. - - "brotli(n)": Brotli compression with level n. - - "lz4": LZ4 compression. - - "lz4_raw": LZ4_RAW compression. - - "zstd(n)": Zstandard compression with level n. - dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses - the default parquet writer setting. - dictionary_page_size_limit: Sets best effort maximum dictionary page size, - in bytes. - statistics_enabled: Sets if statistics are enabled for any column Valid - values are `none`, `chunk`, and `page`. If None, uses the default - parquet writer setting. - max_row_group_size: Target maximum number of rows in each row group - (defaults to 1M rows). Writing larger row groups requires more memory to - write, but can get better compression and be faster to read. - created_by: Sets "created by" property. - column_index_truncate_length: Sets column index truncate length. - statistics_truncate_length: Sets statistics truncate length. If None, uses - the default parquet writer setting. 
- data_page_row_count_limit: Sets best effort maximum number of rows in a data - page. - encoding: Sets default encoding for any column. Valid values are `plain`, - `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, - `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and - `byte_stream_split`. If None, uses the default parquet writer setting. - bloom_filter_on_write: Write bloom filters for all columns when creating - parquet files. - bloom_filter_fpp: Sets bloom filter false positive probability. If None, - uses the default parquet writer setting - bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses - the default parquet writer setting. - allow_single_file_parallelism: Controls whether DataFusion will attempt to - speed up writing parquet files by serializing them in parallel. Each - column in each row group in each output file are serialized in parallel - leveraging a maximum possible core count of n_files * n_row_groups * - n_columns. - maximum_parallel_row_group_writers: By default parallel parquet writer is - tuned for minimum memory usage in a streaming execution plan. You may - see a performance benefit when writing large parquet files by increasing - `maximum_parallel_row_group_writers` and - `maximum_buffered_record_batches_per_stream` if your system has idle - cores and can tolerate additional memory usage. Boosting these values is - likely worthwhile when writing out already in-memory data, such as from - a cached data frame. - maximum_buffered_record_batches_per_stream: See - `maximum_parallel_row_group_writers`. - column_specific_options: Overrides options for specific columns. If a column - is not a part of this dictionary, it will use the parameters provided here. + also be set on a column by column basis, with the field ``column_specific_options`` + (see ``ParquetColumnOptions``). """ def __init__( @@ -208,7 +148,72 @@ def __init__( maximum_buffered_record_batches_per_stream: int = 2, column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None, ) -> None: - """Initialize the ParquetWriterOptions.""" + """Initialize the ParquetWriterOptions. + + Args: + data_pagesize_limit: Sets best effort maximum size of data page in bytes. + write_batch_size: Sets write_batch_size in bytes. + writer_version: Sets parquet writer version. Valid values are ``1.0`` and + ``2.0``. + skip_arrow_metadata: Skip encoding the embedded arrow metadata in the + KV_meta. + compression: Compression type to use. Default is ``zstd(3)``. + Available compression types are + + - ``uncompressed``: No compression. + - ``snappy``: Snappy compression. + - ``gzip(n)``: Gzip compression with level n. + - ``brotli(n)``: Brotli compression with level n. + - ``lz4``: LZ4 compression. + - ``lz4_raw``: LZ4_RAW compression. + - ``zstd(n)``: Zstandard compression with level n. + compression_level: Compression level to set. + dictionary_enabled: Sets if dictionary encoding is enabled. If ``None``, + uses the default parquet writer setting. + dictionary_page_size_limit: Sets best effort maximum dictionary page size, + in bytes. + statistics_enabled: Sets if statistics are enabled for any column Valid + values are ``none``, ``chunk``, and ``page``. If ``None``, uses the + default parquet writer setting. + max_row_group_size: Target maximum number of rows in each row group + (defaults to 1M rows). Writing larger row groups requires more memory + to write, but can get better compression and be faster to read. + created_by: Sets "created by" property. 
+ column_index_truncate_length: Sets column index truncate length. + statistics_truncate_length: Sets statistics truncate length. If ``None``, + uses the default parquet writer setting. + data_page_row_count_limit: Sets best effort maximum number of rows in a data + page. + encoding: Sets default encoding for any column. Valid values are ``plain``, + ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``, + ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``, + and ``byte_stream_split``. If ``None``, uses the default parquet writer + setting. + bloom_filter_on_write: Write bloom filters for all columns when creating + parquet files. + bloom_filter_fpp: Sets bloom filter false positive probability. If ``None``, + uses the default parquet writer setting + bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``, + uses the default parquet writer setting. + allow_single_file_parallelism: Controls whether DataFusion will attempt to + speed up writing parquet files by serializing them in parallel. Each + column in each row group in each output file are serialized in parallel + leveraging a maximum possible core count of + ``n_files * n_row_groups * n_columns``. + maximum_parallel_row_group_writers: By default parallel parquet writer is + tuned for minimum memory usage in a streaming execution plan. You may + see a performance benefit when writing large parquet files by increasing + ``maximum_parallel_row_group_writers`` and + ``maximum_buffered_record_batches_per_stream`` if your system has idle + cores and can tolerate additional memory usage. Boosting these values is + likely worthwhile when writing out already in-memory data, such as from + a cached data frame. + maximum_buffered_record_batches_per_stream: See + ``maximum_parallel_row_group_writers``. + column_specific_options: Overrides options for specific columns. If a column + is not a part of this dictionary, it will use the parameters provided + here. + """ self.data_pagesize_limit = data_pagesize_limit self.write_batch_size = write_batch_size self.writer_version = writer_version @@ -241,29 +246,7 @@ class ParquetColumnOptions: """Parquet options for individual columns. Contains the available options that can be applied for an individual Parquet column, - replacing the global options in `ParquetWriterOptions`. - - Attributes: - encoding: Sets encoding for the column path. Valid values are: `plain`, - `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, - `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and - `byte_stream_split`. These values are not case-sensitive. If `None`, uses - the default parquet options - dictionary_enabled: Sets if dictionary encoding is enabled for the column path. - If `None`, uses the default parquet options - compression: Sets default parquet compression codec for the column path. Valid - values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`, - `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If - `None`, uses the default parquet options. - statistics_enabled: Sets if statistics are enabled for the column Valid values - are: `none`, `chunk`, and `page` These values are not case sensitive. If - `None`, uses the default parquet options. - bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If - `None`, uses the default parquet options. - bloom_filter_fpp: Sets bloom filter false positive probability for the column - path. 
If `None`, uses the default parquet options. - bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses - the default parquet options. + replacing the global options in ``ParquetWriterOptions``. """ def __init__( @@ -276,7 +259,31 @@ def __init__( bloom_filter_fpp: Optional[float] = None, bloom_filter_ndv: Optional[int] = None, ) -> None: - """Initialize the ParquetColumnOptions.""" + """Initialize the ParquetColumnOptions. + + Args: + encoding: Sets encoding for the column path. Valid values are: ``plain``, + ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``, + ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``, + and ``byte_stream_split``. These values are not case-sensitive. If + ``None``, uses the default parquet options + dictionary_enabled: Sets if dictionary encoding is enabled for the column + path. If `None`, uses the default parquet options + compression: Sets default parquet compression codec for the column path. + Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``, + ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These + values are not case-sensitive. If ``None``, uses the default parquet + options. + statistics_enabled: Sets if statistics are enabled for the column Valid + values are: ``none``, ``chunk``, and ``page`` These values are not case + sensitive. If ``None``, uses the default parquet options. + bloom_filter_enabled: Sets if bloom filter is enabled for the column path. + If ``None``, uses the default parquet options. + bloom_filter_fpp: Sets bloom filter false positive probability for the + column path. If ``None``, uses the default parquet options. + bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``, + uses the default parquet options. + """ self.encoding = encoding self.dictionary_enabled = dictionary_enabled self.compression = compression From 0d06c0a196079ffa3e6b16ecd6c46aa0fbdb76f7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 11:03:27 -0400 Subject: [PATCH 7/8] Whitespace correction --- python/datafusion/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index e8c26ba33..078bf3d2c 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -272,8 +272,8 @@ def __init__( compression: Sets default parquet compression codec for the column path. Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``, ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These - values are not case-sensitive. If ``None``, uses the default parquet - options. + values are not case-sensitive. If ``None``, uses the default parquet + options. statistics_enabled: Sets if statistics are enabled for the column Valid values are: ``none``, ``chunk``, and ``page`` These values are not case sensitive. If ``None``, uses the default parquet options. 
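For readers of the docstring changes in patch 6 (and the whitespace fix above), a minimal usage sketch of the two options classes follows. It assumes ParquetWriterOptions and ParquetColumnOptions are imported from datafusion.dataframe, the module patched here; the column name "id" is purely illustrative, and the call that hands the finished object to the DataFrame parquet writer is not part of these patches, so it is not shown.

from datafusion.dataframe import ParquetColumnOptions, ParquetWriterOptions

# Per-column override for a hypothetical "id" column: plain encoding and no
# bloom filter, regardless of the file-level settings chosen below.
id_options = ParquetColumnOptions(
    encoding="plain",
    bloom_filter_enabled=False,
)

# File-level settings; anything left unset falls back to the defaults described
# in the ParquetWriterOptions docstring (e.g. compression defaults to "zstd(3)").
writer_options = ParquetWriterOptions(
    compression="snappy",
    max_row_group_size=500_000,
    bloom_filter_on_write=True,
    column_specific_options={"id": id_options},
)
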
From dc72fb0c7a2ac6126be277ff92a2dee3da50bb76 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 30 Aug 2025 12:45:31 -0400 Subject: [PATCH 8/8] Documentation test will occur in the build docs section and will fail now that the setting is correct to turn warnings into errors --- .github/workflows/test.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6ff30ac4d..ce50d1bb6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -80,13 +80,6 @@ jobs: with: enable-cache: true - - name: Check documentation - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: | - uv sync --dev --group docs --no-install-package datafusion - uv run --no-project maturin develop --uv - uv run --no-project docs/build.sh - - name: Run tests env: RUST_BACKTRACE: 1