diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index bc770bbc9..97ece3d42 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -271,7 +271,100 @@ jobs:
         with:
           name: dist
           pattern: dist-*
-
+
+  # Documentation build job that runs after wheels are built
+  build-docs:
+    name: Build docs
+    runs-on: ubuntu-latest
+    needs: [build-manylinux-x86_64]  # Only need the Linux wheel for docs
+    # Only run docs on main branch pushes, tags, or PRs
+    if: github.event_name == 'push' || github.event_name == 'pull_request'
+    steps:
+      - name: Set target branch
+        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
+        id: target-branch
+        run: |
+          set -x
+          if test '${{ github.ref }}' = 'refs/heads/main'; then
+            echo "value=asf-staging" >> "$GITHUB_OUTPUT"
+          elif test '${{ github.ref_type }}' = 'tag'; then
+            echo "value=asf-site" >> "$GITHUB_OUTPUT"
+          else
+            echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}"
+            exit 1
+          fi
+
+      - name: Checkout docs sources
+        uses: actions/checkout@v5
+
+      - name: Checkout docs target branch
+        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+          ref: ${{ steps.target-branch.outputs.value }}
+          path: docs-target
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      # Download the Linux wheel built in the previous job
+      - name: Download pre-built Linux wheel
+        uses: actions/download-artifact@v5
+        with:
+          name: dist-manylinux-x86_64
+          path: wheels/
+
+      # Install from the pre-built wheel
+      - name: Install from pre-built wheel
+        run: |
+          set -x
+          uv venv
+          # Install documentation dependencies
+          uv sync --dev --no-install-package datafusion --group docs
+          # Install the pre-built wheel
+          WHEEL=$(find wheels/ -name "*.whl" | head -1)
+          if [ -n "$WHEEL" ]; then
+            echo "Installing wheel: $WHEEL"
+            uv pip install "$WHEEL"
+          else
+            echo "ERROR: No wheel found!"
+            exit 1
+          fi
+
+      - name: Build docs
+        run: |
+          set -x
+          cd docs
+          curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
+          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
+          uv run --no-project make html
+
+      - name: Copy & push the generated HTML
+        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
+        run: |
+          set -x
+          cd docs-target
+          # delete anything but: 1) '.'; 2) '..'; 3) .git/
+          find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf
+          cp ../.asf.yaml .
+          cp -r ../docs/build/html/* .
+          git status --porcelain
+          if [ "$(git status --porcelain)" != "" ]; then
+            git config user.name "github-actions[bot]"
+            git config user.email "github-actions[bot]@users.noreply.github.com"
+            git add --all
+            git commit -m 'Publish built docs triggered by ${{ github.sha }}'
+            git push || git push --force
+          fi
+
   # NOTE: PyPI publish needs to be done manually for now after release passed the vote
   # release:
   #   name: Publish in PyPI
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
deleted file mode 100644
index c24fa5ade..000000000
--- a/.github/workflows/docs.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-on:
-  push:
-    branches:
-      - main
-    tags-ignore:
-      - "**-rc**"
-  pull_request:
-    branches:
-      - main
-
-name: Deploy DataFusion Python site
-
-jobs:
-  debug-github-context:
-    name: Print github context
-    runs-on: ubuntu-latest
-    steps:
-      - name: Dump GitHub context
-        env:
-          GITHUB_CONTEXT: ${{ toJson(github) }}
-        run: |
-          echo "$GITHUB_CONTEXT"
-  build-docs:
-    name: Build docs
-    runs-on: ubuntu-latest
-    steps:
-      - name: Set target branch
-        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
-        id: target-branch
-        run: |
-          set -x
-          if test '${{ github.ref }}' = 'refs/heads/main'; then
-            echo "value=asf-staging" >> "$GITHUB_OUTPUT"
-          elif test '${{ github.ref_type }}' = 'tag'; then
-            echo "value=asf-site" >> "$GITHUB_OUTPUT"
-          else
-            echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}"
-            exit 1
-          fi
-      - name: Checkout docs sources
-        uses: actions/checkout@v5
-      - name: Checkout docs target branch
-        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
-        uses: actions/checkout@v5
-        with:
-          fetch-depth: 0
-          ref: ${{ steps.target-branch.outputs.value }}
-          path: docs-target
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v3
-        with:
-          version: '27.4'
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Install dependencies and build
-        uses: astral-sh/setup-uv@v6
-        with:
-          enable-cache: true
-
-      - name: Build repo
-        run: |
-          uv venv
-          uv sync --dev --no-install-package datafusion --group docs
-          uv run --no-project maturin develop --uv
-
-      - name: Build docs
-        run: |
-          set -x
-          cd docs
-          curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv
-          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
-          uv run --no-project make html
-
-      - name: Copy & push the generated HTML
-        if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag')
-        run: |
-          set -x
-          cd docs-target
-          # delete anything but: 1) '.'; 2) '..'; 3) .git/
-          find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf
-          cp ../.asf.yaml .
-          cp -r ../docs/build/html/* .
-          git status --porcelain
-          if [ "$(git status --porcelain)" != "" ]; then
-            git config user.name "github-actions[bot]"
-            git config user.email "github-actions[bot]@users.noreply.github.com"
-            git add --all
-            git commit -m 'Publish built docs triggered by ${{ github.sha }}'
-            git push || git push --force
-          fi
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 6ff30ac4d..ce50d1bb6 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -80,13 +80,6 @@ jobs:
         with:
           enable-cache: true
 
-      - name: Check documentation
-        if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }}
-        run: |
-          uv sync --dev --group docs --no-install-package datafusion
-          uv run --no-project maturin develop --uv
-          uv run --no-project docs/build.sh
-
       - name: Run tests
         env:
           RUST_BACKTRACE: 1
diff --git a/docs/Makefile b/docs/Makefile
index e65c8e250..49ebae372 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -35,4 +35,4 @@ help:
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --fail-on-warning
\ No newline at end of file
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 61cb09438..078bf3d2c 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -119,68 +119,8 @@ class ParquetWriterOptions:
     """Advanced parquet writer options.
 
     Allows settings the writer options that apply to the entire file. Some options can
-    also be set on a column by column basis, with the field `column_specific_options`
-    (see `ParquetColumnOptions`).
-
-    Attributes:
-        data_pagesize_limit: Sets best effort maximum size of data page in bytes.
-        write_batch_size: Sets write_batch_size in bytes.
-        writer_version: Sets parquet writer version. Valid values are `1.0` and
-            `2.0`.
-        skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
-            KV_meta.
-        compression: Compression type to use. Default is "zstd(3)".
-            Available compression types are
-            - "uncompressed": No compression.
-            - "snappy": Snappy compression.
-            - "gzip(n)": Gzip compression with level n.
-            - "brotli(n)": Brotli compression with level n.
-            - "lz4": LZ4 compression.
-            - "lz4_raw": LZ4_RAW compression.
-            - "zstd(n)": Zstandard compression with level n.
-        dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses
-            the default parquet writer setting.
-        dictionary_page_size_limit: Sets best effort maximum dictionary page size,
-            in bytes.
-        statistics_enabled: Sets if statistics are enabled for any column Valid
-            values are `none`, `chunk`, and `page`. If None, uses the default
-            parquet writer setting.
-        max_row_group_size: Target maximum number of rows in each row group
-            (defaults to 1M rows). Writing larger row groups requires more memory to
-            write, but can get better compression and be faster to read.
-        created_by: Sets "created by" property.
-        column_index_truncate_length: Sets column index truncate length.
-        statistics_truncate_length: Sets statistics truncate length. If None, uses
-            the default parquet writer setting.
-        data_page_row_count_limit: Sets best effort maximum number of rows in a data
-            page.
-        encoding: Sets default encoding for any column. Valid values are `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. If None, uses the default parquet writer setting.
-        bloom_filter_on_write: Write bloom filters for all columns when creating
-            parquet files.
-        bloom_filter_fpp: Sets bloom filter false positive probability. If None,
-            uses the default parquet writer setting
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses
-            the default parquet writer setting.
-        allow_single_file_parallelism: Controls whether DataFusion will attempt to
-            speed up writing parquet files by serializing them in parallel. Each
-            column in each row group in each output file are serialized in parallel
-            leveraging a maximum possible core count of n_files * n_row_groups *
-            n_columns.
-        maximum_parallel_row_group_writers: By default parallel parquet writer is
-            tuned for minimum memory usage in a streaming execution plan. You may
-            see a performance benefit when writing large parquet files by increasing
-            `maximum_parallel_row_group_writers` and
-            `maximum_buffered_record_batches_per_stream` if your system has idle
-            cores and can tolerate additional memory usage. Boosting these values is
-            likely worthwhile when writing out already in-memory data, such as from
-            a cached data frame.
-        maximum_buffered_record_batches_per_stream: See
-            `maximum_parallel_row_group_writers`.
-        column_specific_options: Overrides options for specific columns. If a column
-            is not a part of this dictionary, it will use the parameters provided here.
+    also be set on a column by column basis, with the field ``column_specific_options``
+    (see ``ParquetColumnOptions``).
     """
 
     def __init__(
@@ -208,7 +148,72 @@ def __init__(
         maximum_buffered_record_batches_per_stream: int = 2,
         column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None,
     ) -> None:
-        """Initialize the ParquetWriterOptions."""
+        """Initialize the ParquetWriterOptions.
+
+        Args:
+            data_pagesize_limit: Sets best effort maximum size of data page in bytes.
+            write_batch_size: Sets write_batch_size in bytes.
+            writer_version: Sets parquet writer version. Valid values are ``1.0`` and
+                ``2.0``.
+            skip_arrow_metadata: Skip encoding the embedded arrow metadata in the
+                KV_meta.
+            compression: Compression type to use. Default is ``zstd(3)``.
+                Available compression types are
+
+                - ``uncompressed``: No compression.
+                - ``snappy``: Snappy compression.
+                - ``gzip(n)``: Gzip compression with level n.
+                - ``brotli(n)``: Brotli compression with level n.
+                - ``lz4``: LZ4 compression.
+                - ``lz4_raw``: LZ4_RAW compression.
+                - ``zstd(n)``: Zstandard compression with level n.
+            compression_level: Compression level to set.
+            dictionary_enabled: Sets if dictionary encoding is enabled. If ``None``,
+                uses the default parquet writer setting.
+            dictionary_page_size_limit: Sets best effort maximum dictionary page size,
+                in bytes.
+            statistics_enabled: Sets if statistics are enabled for any column Valid
+                values are ``none``, ``chunk``, and ``page``. If ``None``, uses the
+                default parquet writer setting.
+            max_row_group_size: Target maximum number of rows in each row group
+                (defaults to 1M rows). Writing larger row groups requires more memory
+                to write, but can get better compression and be faster to read.
+            created_by: Sets "created by" property.
+            column_index_truncate_length: Sets column index truncate length.
+            statistics_truncate_length: Sets statistics truncate length. If ``None``,
+                uses the default parquet writer setting.
+            data_page_row_count_limit: Sets best effort maximum number of rows in a data
+                page.
+            encoding: Sets default encoding for any column. Valid values are ``plain``,
+                ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
+                ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
+                and ``byte_stream_split``. If ``None``, uses the default parquet writer
+                setting.
+            bloom_filter_on_write: Write bloom filters for all columns when creating
+                parquet files.
+            bloom_filter_fpp: Sets bloom filter false positive probability. If ``None``,
+                uses the default parquet writer setting
+            bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
+                uses the default parquet writer setting.
+            allow_single_file_parallelism: Controls whether DataFusion will attempt to
+                speed up writing parquet files by serializing them in parallel. Each
+                column in each row group in each output file are serialized in parallel
+                leveraging a maximum possible core count of
+                ``n_files * n_row_groups * n_columns``.
+            maximum_parallel_row_group_writers: By default parallel parquet writer is
+                tuned for minimum memory usage in a streaming execution plan. You may
+                see a performance benefit when writing large parquet files by increasing
+                ``maximum_parallel_row_group_writers`` and
+                ``maximum_buffered_record_batches_per_stream`` if your system has idle
+                cores and can tolerate additional memory usage. Boosting these values is
+                likely worthwhile when writing out already in-memory data, such as from
+                a cached data frame.
+            maximum_buffered_record_batches_per_stream: See
+                ``maximum_parallel_row_group_writers``.
+            column_specific_options: Overrides options for specific columns. If a column
+                is not a part of this dictionary, it will use the parameters provided
+                here.
+        """
         self.data_pagesize_limit = data_pagesize_limit
         self.write_batch_size = write_batch_size
         self.writer_version = writer_version
@@ -241,29 +246,7 @@ class ParquetColumnOptions:
     """Parquet options for individual columns.
 
    Contains the available options that can be applied for an individual Parquet column,
-    replacing the global options in `ParquetWriterOptions`.
-
-    Attributes:
-        encoding: Sets encoding for the column path. Valid values are: `plain`,
-            `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`,
-            `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and
-            `byte_stream_split`. These values are not case-sensitive. If `None`, uses
-            the default parquet options
-        dictionary_enabled: Sets if dictionary encoding is enabled for the column path.
-            If `None`, uses the default parquet options
-        compression: Sets default parquet compression codec for the column path. Valid
-            values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`,
-            `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If
-            `None`, uses the default parquet options.
-        statistics_enabled: Sets if statistics are enabled for the column Valid values
-            are: `none`, `chunk`, and `page` These values are not case sensitive. If
-            `None`, uses the default parquet options.
-        bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If
-            `None`, uses the default parquet options.
-        bloom_filter_fpp: Sets bloom filter false positive probability for the column
-            path. If `None`, uses the default parquet options.
-        bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses
-            the default parquet options.
+    replacing the global options in ``ParquetWriterOptions``.
     """
 
     def __init__(
@@ -276,7 +259,31 @@ def __init__(
         bloom_filter_fpp: Optional[float] = None,
         bloom_filter_ndv: Optional[int] = None,
     ) -> None:
-        """Initialize the ParquetColumnOptions."""
+        """Initialize the ParquetColumnOptions.
+
+        Args:
+            encoding: Sets encoding for the column path. Valid values are: ``plain``,
+                ``plain_dictionary``, ``rle``, ``bit_packed``, ``delta_binary_packed``,
+                ``delta_length_byte_array``, ``delta_byte_array``, ``rle_dictionary``,
+                and ``byte_stream_split``. These values are not case-sensitive. If
+                ``None``, uses the default parquet options
+            dictionary_enabled: Sets if dictionary encoding is enabled for the column
+                path. If `None`, uses the default parquet options
+            compression: Sets default parquet compression codec for the column path.
+                Valid values are ``uncompressed``, ``snappy``, ``gzip(level)``, ``lzo``,
+                ``brotli(level)``, ``lz4``, ``zstd(level)``, and ``lz4_raw``. These
+                values are not case-sensitive. If ``None``, uses the default parquet
+                options.
+            statistics_enabled: Sets if statistics are enabled for the column Valid
+                values are: ``none``, ``chunk``, and ``page`` These values are not case
+                sensitive. If ``None``, uses the default parquet options.
+            bloom_filter_enabled: Sets if bloom filter is enabled for the column path.
+                If ``None``, uses the default parquet options.
+            bloom_filter_fpp: Sets bloom filter false positive probability for the
+                column path. If ``None``, uses the default parquet options.
+            bloom_filter_ndv: Sets bloom filter number of distinct values. If ``None``,
+                uses the default parquet options.
+        """
         self.encoding = encoding
         self.dictionary_enabled = dictionary_enabled
         self.compression = compression
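For reference, a minimal usage sketch of the two option classes documented above. Constructor arguments are taken from the `__init__` signatures in this diff; the column name and the final write call are assumptions for illustration only and are not part of this change.

```python
# Sketch only: argument names come from the __init__ signatures in this diff.
from datafusion.dataframe import ParquetColumnOptions, ParquetWriterOptions

# Per-column overrides: disable dictionary encoding and enable a bloom filter
# for one column ("user_id" is a hypothetical column name).
column_options = {
    "user_id": ParquetColumnOptions(
        dictionary_enabled=False,
        bloom_filter_enabled=True,
        bloom_filter_fpp=0.01,
    ),
}

# File-wide writer options; columns not listed in column_specific_options
# fall back to these values.
writer_options = ParquetWriterOptions(
    compression="zstd(3)",
    max_row_group_size=1_000_000,
    column_specific_options=column_options,
)

# Hypothetical usage: pass the options to a DataFrame parquet write
# (the exact write API is assumed, not shown in this diff).
# df.write_parquet_with_options("out.parquet", writer_options)
```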