From 8a706b1f20d463802cc377f1ac2fe169904e414c Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 14 Apr 2026 19:57:20 +0100 Subject: [PATCH] Fix registry backfill with per-provider versions and Docker extraction (#65223) Chain both extraction scripts in a single uv run invocation to avoid creating two ephemeral environments per version. (cherry picked from commit cda493d1b600d80ba5306eadd8a2fc9e1703b106) --- .github/workflows/registry-backfill.yml | 116 ++++++---- dev/breeze/doc/images/output_registry.svg | 8 +- dev/breeze/doc/images/output_registry.txt | 2 +- .../doc/images/output_registry_backfill.svg | 52 +++-- .../doc/images/output_registry_backfill.txt | 2 +- .../commands/registry_commands.py | 211 +++++++++++++++--- .../commands/registry_commands_config.py | 2 + dev/registry/.gitignore | 1 + 8 files changed, 293 insertions(+), 101 deletions(-) diff --git a/.github/workflows/registry-backfill.yml b/.github/workflows/registry-backfill.yml index 65da127365fff..30819447ca28c 100644 --- a/.github/workflows/registry-backfill.yml +++ b/.github/workflows/registry-backfill.yml @@ -29,23 +29,54 @@ on: # yamllint disable-line rule:truthy - staging - live default: staging - providers: + provider-versions: description: > - Space-separated provider IDs - (e.g. 'amazon google databricks') - required: true - type: string - versions: - description: > - Space-separated versions to backfill - (e.g. '9.15.0 9.14.0'). Applied to ALL providers. + Space-separated provider/version pairs + (e.g. 'amazon/9.24.0 google/21.0.0 celery/3.17.2'). + Multiple versions per provider are grouped into one job. required: true type: string permissions: contents: read + packages: read jobs: + build-ci-image: + name: "Build CI image" + uses: ./.github/workflows/ci-image-build.yml + permissions: + contents: read + packages: write + if: > + contains(fromJSON('[ + "ashb", + "bugraoz93", + "eladkal", + "ephraimbuddy", + "jedcunningham", + "jscheffl", + "kaxil", + "pierrejeambrun", + "shahar1", + "potiuk", + "utkarsharma2", + "vincbeck" + ]'), github.event.sender.login) + with: + runners: '["ubuntu-22.04"]' + platform: "linux/amd64" + push-image: "false" + upload-image-artifact: "true" + upload-mount-cache-artifact: "false" + python-versions: '["3.12"]' + branch: "main" + constraints-branch: "constraints-main" + use-uv: "true" + upgrade-to-newer-dependencies: "false" + docker-cache: "registry" + disable-airflow-repo-cache: "false" + prepare: runs-on: ubuntu-latest outputs: @@ -55,12 +86,19 @@ jobs: - name: "Build provider matrix" id: matrix env: - PROVIDERS: ${{ inputs.providers }} + PROVIDER_VERSIONS: ${{ inputs.provider-versions }} run: | - MATRIX=$(echo "${PROVIDERS}" \ - | tr ' ' '\n' | jq -R . \ - | jq -cs '{"provider": .}') + # Parse provider/version pairs, group by provider + # Input: "amazon/9.24.0 google/21.0.0 amazon/9.23.0" + # Output: {"include": [{"provider":"amazon","versions":"9.24.0 9.23.0"}, ...]} + MATRIX=$(echo "${PROVIDER_VERSIONS}" | tr ' ' '\n' | grep '/' | \ + jq -R 'split("/") | {provider: .[0], version: .[1]}' | \ + jq -cs 'group_by(.provider) | map({ + provider: .[0].provider, + versions: (map(.version) | join(" ")) + }) | {include: .}') echo "matrix=${MATRIX}" >> "${GITHUB_OUTPUT}" + echo "Matrix: ${MATRIX}" - name: "Determine S3 destination" id: destination @@ -76,28 +114,16 @@ jobs: >> "${GITHUB_OUTPUT}" backfill: - needs: prepare + needs: [prepare, build-ci-image] runs-on: ubuntu-latest timeout-minutes: 60 strategy: fail-fast: false matrix: ${{ fromJSON(needs.prepare.outputs.matrix) }} - name: "Backfill ${{ matrix.provider }}" - if: > - contains(fromJSON('[ - "ashb", - "bugraoz93", - "eladkal", - "ephraimbuddy", - "jedcunningham", - "jscheffl", - "kaxil", - "pierrejeambrun", - "shahar1", - "potiuk", - "utkarsharma2", - "vincbeck" - ]'), github.event.sender.login) + name: "Backfill ${{ matrix.provider }} (${{ matrix.versions }})" + permissions: + contents: read + packages: read steps: - name: "Checkout repository" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -107,23 +133,23 @@ jobs: - name: "Fetch provider tags" env: - VERSIONS: ${{ inputs.versions }} + VERSIONS: ${{ matrix.versions }} PROVIDER: ${{ matrix.provider }} run: | for VERSION in ${VERSIONS}; do TAG="providers-${PROVIDER}/${VERSION}" echo "Fetching tag: ${TAG}" git fetch origin tag "${TAG}" \ - 2>/dev/null || echo "Tag not found" + 2>/dev/null || echo "Tag not found: ${TAG}" done - - name: "Install uv" - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 - - - name: "Install Breeze" - uses: ./.github/actions/breeze + - name: "Prepare breeze & CI image" + uses: ./.github/actions/prepare_breeze_and_image with: - python-version: "3.12" + python: "3.12" + platform: "linux/amd64" + use-uv: "true" + make-mnt-writeable-and-cleanup: "true" - name: "Install AWS CLI v2" run: | @@ -152,7 +178,7 @@ jobs: - name: "Extract version metadata from git tags" env: - VERSIONS: ${{ inputs.versions }} + VERSIONS: ${{ matrix.versions }} PROVIDER: ${{ matrix.provider }} run: | VERSION_ARGS="" @@ -164,7 +190,7 @@ jobs: - name: "Run breeze registry backfill" env: - VERSIONS: ${{ inputs.versions }} + VERSIONS: ${{ matrix.versions }} PROVIDER: ${{ matrix.provider }} run: | VERSION_ARGS="" @@ -172,7 +198,7 @@ jobs: VERSION_ARGS="${VERSION_ARGS} --version ${VERSION}" done breeze registry backfill \ - --provider "${PROVIDER}" ${VERSION_ARGS} + --provider "${PROVIDER}" --python 3.12 ${VERSION_ARGS} - name: "Download data files from S3 for build" env: @@ -186,12 +212,12 @@ jobs: registry/src/_data/modules.json - name: "Setup pnpm" - uses: pnpm/action-setup@8912a9102ac27614460f54aedde9e1e7f9aec20d # v6.0.5 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 9 + version: 10 - name: "Setup Node.js" - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0 with: node-version: 24 cache: 'pnpm' @@ -211,7 +237,7 @@ jobs: env: S3_BUCKET: ${{ needs.prepare.outputs.bucket }} CACHE_CONTROL: "public, max-age=300" - VERSIONS: ${{ inputs.versions }} + VERSIONS: ${{ matrix.versions }} PROVIDER: ${{ matrix.provider }} run: | for VERSION in ${VERSIONS}; do diff --git a/dev/breeze/doc/images/output_registry.svg b/dev/breeze/doc/images/output_registry.svg index 951851010e777..d836e0dd52c4e 100644 --- a/dev/breeze/doc/images/output_registry.svg +++ b/dev/breeze/doc/images/output_registry.svg @@ -114,10 +114,10 @@ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Registry commands ──────────────────────────────────────────────────────────────────────────────────────────────────╮ extract-data    Extract provider metadata, parameters, and connection types for the registry.                      -backfill        Extract runtime parameters and connections for older provider versions. Uses 'uv run --with' to    -install the specific version in a temporary environment and runs extract_parameters.py +           -extract_connections.py. No Docker needed. Each version uses an isolated providers.json, so         -multiple providers can be backfilled in parallel from separate terminal sessions.                  +backfill        Extract metadata, parameters, and connections for older provider versions. Runs                    +extract_versions.py (host, git tags) for metadata.json, then extract_parameters.py +               +extract_connections.py inside the Breeze CI container (or via 'uv run --with' with --no-docker).   +Each version uses an isolated providers.json, so multiple providers can be backfilled in parallel. publish-versionsPublish per-provider versions.json to S3 from deployed directories. Same pattern as 'breeze        release-management publish-docs-to-s3'.                                                            ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/dev/breeze/doc/images/output_registry.txt b/dev/breeze/doc/images/output_registry.txt index fadd741e8b654..8ae38cfd5e5a3 100644 --- a/dev/breeze/doc/images/output_registry.txt +++ b/dev/breeze/doc/images/output_registry.txt @@ -1 +1 @@ -8c9be6264d33af7facd1fbdf435697b7 +27b4df2c81ed8e0d4c566e552e13bb6a diff --git a/dev/breeze/doc/images/output_registry_backfill.svg b/dev/breeze/doc/images/output_registry_backfill.svg index 4478565366e98..92f5a0586cd7c 100644 --- a/dev/breeze/doc/images/output_registry_backfill.svg +++ b/dev/breeze/doc/images/output_registry_backfill.svg @@ -1,4 +1,4 @@ - + - + @@ -98,9 +98,18 @@ + + + + + + + + + - Command: registry backfill + Command: registry backfill @@ -113,21 +122,24 @@ Usage:breeze registry backfill[OPTIONS] -Extract runtime parameters and connections for older provider versions. Uses 'uv run --with' to install the specific  -version in a temporary environment and runs extract_parameters.py + extract_connections.py. No Docker needed. Each  -version uses an isolated providers.json, so multiple providers can be backfilled in parallel from separate terminal  -sessions. +Extract metadata, parameters, and connections for older provider versions. Runs extract_versions.py (host, git tags)  +for metadata.json, then extract_parameters.py + extract_connections.py inside the Breeze CI container (or via 'uv run  +--with' with --no-docker). Each version uses an isolated providers.json, so multiple providers can be backfilled in  +parallel. ╭─ Backfill flags ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ -*--providerProvider ID (e.g. 'amazon', 'google', 'microsoft-azure'). [required](TEXT) -*--version Version(s) to extract. Can be specified multiple times: --version 9.21.0 --version 9.20.0 [required] -(TEXT) -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. ---help   -hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +--python                -pPython major/minor version used in Airflow image for images. [default: 3.10] +(>3.10< | 3.11 | 3.12 | 3.13 | 3.14) +*--provider              Provider ID (e.g. 'amazon', 'google', 'microsoft-azure'). [required](TEXT) +*--version               Version(s) to extract. Can be specified multiple times: --version 9.21.0 --version +9.20.0 [required](TEXT) +--use-docker/--no-dockerRun extraction in CI Docker container (default) or via uv on host. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--help   -hShow this message and exit. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/dev/breeze/doc/images/output_registry_backfill.txt b/dev/breeze/doc/images/output_registry_backfill.txt index cff872551dbe0..6a923c1535fdd 100644 --- a/dev/breeze/doc/images/output_registry_backfill.txt +++ b/dev/breeze/doc/images/output_registry_backfill.txt @@ -1 +1 @@ -5cddc0e9c5f9524a7e1baf6c21d74263 +88cde3ee57bf72e6fb436203f64e6a14 diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands.py b/dev/breeze/src/airflow_breeze/commands/registry_commands.py index 9fb5825642824..fac9a7fedd0f8 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands.py @@ -17,6 +17,7 @@ from __future__ import annotations import json +import shutil import sys import tempfile import uuid @@ -34,6 +35,23 @@ from airflow_breeze.utils.path_utils import AIRFLOW_ROOT_PATH from airflow_breeze.utils.run_utils import run_command +PROVIDERS_DIR = AIRFLOW_ROOT_PATH / "providers" + + +def _get_suspended_provider_packages() -> list[str]: + """Return in-container pip-installable paths for providers with state: suspended.""" + packages = [] + for yaml_path in sorted(PROVIDERS_DIR.rglob("provider.yaml")): + if "src" in yaml_path.relative_to(PROVIDERS_DIR).parts: + continue + with open(yaml_path) as f: + data = yaml.safe_load(f) + if data.get("state") == "suspended": + # Use in-container path (providers/ is mounted at /opt/airflow/providers/) + rel = yaml_path.parent.relative_to(PROVIDERS_DIR) + packages.append(f"/opt/airflow/providers/{rel}") + return packages + @click.group(cls=BreezeGroup, name="registry", help="Tools for the Airflow Provider Registry") def registry_group(): @@ -65,11 +83,17 @@ def extract_data(python: str, provider: str | None): rebuild_or_pull_ci_image_if_needed(command_params=shell_params) + # Install suspended providers that aren't in the CI image so runtime + # discovery (issubclass) can find their classes. + suspended_packages = _get_suspended_provider_packages() + install_cmd = f"pip install --quiet {' '.join(suspended_packages)} && " if suspended_packages else "" + provider_flag = f" --provider '{provider}'" if provider else "" command = ( + f"{install_cmd}" f"python dev/registry/extract_metadata.py{provider_flag} && " - "python dev/registry/extract_parameters.py && " - "python dev/registry/extract_connections.py" + f"python dev/registry/extract_parameters.py{provider_flag} && " + f"python dev/registry/extract_connections.py{provider_flag}" ) with ci_group("Extracting registry data"): @@ -108,7 +132,6 @@ def publish_versions(s3_bucket: str, providers_json: str | None): _publish_versions(s3_bucket, providers_json_path=providers_path) -PROVIDERS_DIR = AIRFLOW_ROOT_PATH / "providers" DEV_REGISTRY_DIR = AIRFLOW_ROOT_PATH / "dev" / "registry" EXTRACT_SCRIPTS = [ @@ -138,7 +161,7 @@ def _read_provider_yaml_info(provider_id: str) -> tuple[str, list[str]]: try: import tomllib except ImportError: - import tomli as tomllib + import tomli as tomllib # type: ignore[no-redef] provider_yaml_path = _find_provider_yaml(provider_id) with open(provider_yaml_path) as f: @@ -220,14 +243,131 @@ def _run_extract_script( return result.returncode +def _backfill_docker( + python: str, + provider: str, + versions: tuple[str, ...], + package_name: str, + extras: list[str], +) -> list[str]: + """Run parameter/connection extraction inside the Breeze CI container.""" + failed: list[str] = [] + unique_project_name = f"breeze-backfill-{uuid.uuid4().hex[:8]}" + + shell_params = ShellParams( + python=python, + project_name=unique_project_name, + quiet=True, + skip_environment_initialization=True, + extra_args=(), + ) + + rebuild_or_pull_ci_image_if_needed(command_params=shell_params) + + # Place isolated providers.json under dev/registry/ so it's visible inside the container + # at /opt/airflow/dev/registry/ + backfill_tmp_dir = DEV_REGISTRY_DIR / ".backfill_tmp" + backfill_tmp_dir.mkdir(exist_ok=True) + + try: + for version in versions: + click.echo(f"{'=' * 60}") + click.echo(f"Extracting {provider} {version} (Docker)") + click.echo(f"{'=' * 60}") + + providers_json = _create_isolated_providers_json( + provider, package_name, version, backfill_tmp_dir + ) + container_providers_json = f"/opt/airflow/dev/registry/.backfill_tmp/{providers_json.name}" + + pip_spec = _build_pip_spec(package_name, extras, version) + base_spec = f"{package_name}=={version}" + + command = ( + f"cd dev/registry && " + f"uv run --with '{pip_spec}' bash -c '" + f"python extract_parameters.py " + f"--provider {provider} --providers-json {container_providers_json} && " + f"python extract_connections.py " + f"--provider {provider} --providers-json {container_providers_json}'" + ) + + result = execute_command_in_shell( + shell_params=shell_params, + project_name=unique_project_name, + command=command, + preserve_backend=True, + ) + + if result.returncode != 0 and pip_spec != base_spec: + click.echo(f"Retrying without extras ({base_spec})...") + command_fallback = ( + f"cd dev/registry && " + f"uv run --with '{base_spec}' bash -c '" + f"python extract_parameters.py " + f"--provider {provider} --providers-json {container_providers_json} && " + f"python extract_connections.py " + f"--provider {provider} --providers-json {container_providers_json}'" + ) + result = execute_command_in_shell( + shell_params=shell_params, + project_name=unique_project_name, + command=command_fallback, + preserve_backend=True, + ) + + if result.returncode != 0: + click.echo(f"WARNING: extraction failed for {version} (exit {result.returncode})") + failed.append(f"{version}/docker-extraction") + finally: + shutil.rmtree(backfill_tmp_dir, ignore_errors=True) + fix_ownership_using_docker() + + return failed + + +def _backfill_uv( + provider: str, + versions: tuple[str, ...], + package_name: str, + extras: list[str], +) -> list[str]: + """Run parameter/connection extraction via 'uv run --with' on the host.""" + failed: list[str] = [] + + with tempfile.TemporaryDirectory(prefix=f"backfill-{provider}-") as tmp_dir: + tmp_path = Path(tmp_dir) + + for version in versions: + click.echo(f"{'=' * 60}") + click.echo(f"Extracting {provider} {version} (uv)") + click.echo(f"{'=' * 60}") + + providers_json = _create_isolated_providers_json(provider, package_name, version, tmp_path) + + pip_spec = _build_pip_spec(package_name, extras, version) + base_spec = f"{package_name}=={version}" + + for script in EXTRACT_SCRIPTS: + click.echo(f"\nRunning {script.name} with {pip_spec}...") + returncode = _run_extract_script(script, pip_spec, base_spec, provider, providers_json) + if returncode != 0: + click.echo(f"WARNING: {script.name} failed for {version} (exit {returncode})") + failed.append(f"{version}/{script.name}") + + return failed + + @registry_group.command( name="backfill", - help="Extract runtime parameters and connections for older provider versions. " - "Uses 'uv run --with' to install the specific version in a temporary environment " - "and runs extract_parameters.py + extract_connections.py. No Docker needed. " - "Each version uses an isolated providers.json, so multiple providers can be " - "backfilled in parallel from separate terminal sessions.", + help="Extract metadata, parameters, and connections for older provider versions. " + "Runs extract_versions.py (host, git tags) for metadata.json, then " + "extract_parameters.py + extract_connections.py inside the Breeze CI container " + "(or via 'uv run --with' with --no-docker). " + "Each version uses an isolated providers.json, so " + "multiple providers can be backfilled in parallel.", ) +@option_python @click.option( "--provider", required=True, @@ -240,39 +380,49 @@ def _run_extract_script( multiple=True, help="Version(s) to extract. Can be specified multiple times: --version 9.21.0 --version 9.20.0", ) +@click.option( + "--use-docker/--no-docker", + default=True, + help="Run extraction in CI Docker container (default) or via uv on host.", +) @option_verbose @option_dry_run -def backfill(provider: str, versions: tuple[str, ...]): +def backfill(python: str, provider: str, versions: tuple[str, ...], use_docker: bool): package_name, extras = _read_provider_yaml_info(provider) click.echo(f"Provider: {provider} ({package_name})") click.echo(f"Versions: {', '.join(versions)}") + click.echo(f"Mode: {'Docker' if use_docker else 'uv (host)'}") if extras: click.echo(f"Extras: {', '.join(extras)}") click.echo() failed: list[str] = [] - with tempfile.TemporaryDirectory(prefix=f"backfill-{provider}-") as tmp_dir: - tmp_path = Path(tmp_dir) - - for version in versions: - click.echo(f"{'=' * 60}") - click.echo(f"Extracting {provider} {version}") - click.echo(f"{'=' * 60}") - - # Each version gets its own isolated providers.json — no shared state - providers_json = _create_isolated_providers_json(provider, package_name, version, tmp_path) - - pip_spec = _build_pip_spec(package_name, extras, version) - base_spec = f"{package_name}=={version}" - - for script in EXTRACT_SCRIPTS: - click.echo(f"\nRunning {script.name} with {pip_spec}...") - returncode = _run_extract_script(script, pip_spec, base_spec, provider, providers_json) - if returncode != 0: - click.echo(f"WARNING: {script.name} failed for {version} (exit {returncode})") - failed.append(f"{version}/{script.name}") + # Step 1: extract_versions.py (host, reads git tags) -> metadata.json + click.echo("Step 1: Extracting version metadata from git tags...") + for version in versions: + versions_cmd = [ + "uv", + "run", + "python", + str(DEV_REGISTRY_DIR / "extract_versions.py"), + "--provider", + provider, + "--version", + version, + ] + result = run_command(versions_cmd, check=False, cwd=str(AIRFLOW_ROOT_PATH)) + if result.returncode != 0: + click.echo(f"WARNING: extract_versions.py failed for {version} (exit {result.returncode})") + failed.append(f"{version}/extract_versions.py") + + # Step 2: extract_parameters.py + extract_connections.py + click.echo("\nStep 2: Extracting parameters and connections...") + if use_docker: + failed.extend(_backfill_docker(python, provider, versions, package_name, extras)) + else: + failed.extend(_backfill_uv(provider, versions, package_name, extras)) click.echo(f"\n{'=' * 60}") if failed: @@ -282,6 +432,7 @@ def backfill(provider: str, versions: tuple[str, ...]): click.echo(f"Successfully extracted {len(versions)} version(s) for {provider}") click.echo( f"\nOutput written to:\n" + f" registry/src/_data/versions/{provider}//metadata.json\n" f" registry/src/_data/versions/{provider}//parameters.json\n" f" registry/src/_data/versions/{provider}//connections.json" ) diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py index fdd156d45a34b..f37cc46913b0b 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py @@ -39,8 +39,10 @@ { "name": "Backfill flags", "options": [ + "--python", "--provider", "--version", + "--use-docker", ], }, ], diff --git a/dev/registry/.gitignore b/dev/registry/.gitignore index 893bb46fa432b..41879335feca8 100644 --- a/dev/registry/.gitignore +++ b/dev/registry/.gitignore @@ -1,4 +1,5 @@ .backfill-logs/ +.backfill_tmp/ .inventory_cache/ output/ runtime_modules.json