diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 9fca3e61d6bd8..d6e323ae14232 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -115,6 +115,11 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false + # Tags drive the phantom-version filter in extract_metadata.py + # (only versions with a real `providers-/` tag are + # treated as released). Without this, the filter silently + # falls back to `versions[0]` and ships phantom versions. + fetch-tags: true - name: "Prepare breeze & CI image" uses: ./.github/actions/prepare_breeze_and_image @@ -176,11 +181,20 @@ jobs: - name: "Extract registry data (breeze)" env: PROVIDER: ${{ inputs.provider }} + DESTINATION: ${{ inputs.destination }} run: | + # Staging dispatches preview unreleased providers (maintainers want to + # verify newly-bumped versions look right before tagging). Live builds + # filter them so the production registry never ships pointers to + # non-existent PyPI releases / GitHub tags / docs pages. + ALLOW_UNRELEASED="" + if [[ "${DESTINATION}" == "staging" ]]; then + ALLOW_UNRELEASED="--allow-unreleased" + fi if [[ -n "${PROVIDER}" ]]; then - breeze registry extract-data --python 3.12 --provider "${PROVIDER}" + breeze registry extract-data --python 3.12 --provider "${PROVIDER}" ${ALLOW_UNRELEASED} else - breeze registry extract-data --python 3.12 + breeze registry extract-data --python 3.12 ${ALLOW_UNRELEASED} fi # --- Incremental: merge new data with existing --- diff --git a/dev/breeze/doc/images/output_registry_extract-data.svg b/dev/breeze/doc/images/output_registry_extract-data.svg index c1c5d148f7b4f..c7dda404a5663 100644 --- a/dev/breeze/doc/images/output_registry_extract-data.svg +++ b/dev/breeze/doc/images/output_registry_extract-data.svg @@ -1,4 +1,4 @@ - + - + @@ -87,9 +88,18 @@ + + + + + + + + + - Command: registry extract-data + Command: registry extract-data @@ -105,15 +115,18 @@ Extract provider metadata, parameters, and connection types for the registry. ╭─ Extract data flags ─────────────────────────────────────────────────────────────────────────────────────────────────╮ ---python  -pPython major/minor version used in Airflow image for images. [default: 3.10](>3.10< | 3.11 | 3.12 | -3.13 | 3.14) ---providerExtract only this provider ID (e.g. 'amazon'). Omit for full build. (TEXT) -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. ---help   -hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +--python          -pPython major/minor version used in Airflow image for images. [default: 3.10](>3.10< | 3.11  +| 3.12 | 3.13 | 3.14) +--provider        Extract only this provider ID (e.g. 'amazon'). Omit for full build. (TEXT) +--allow-unreleasedInclude providers and versions that don't have a matching providers-<id>/<ver> git tag. Use  +for staging builds and local dev where maintainers want to preview unreleased provider pages +before the tag lands. Forwarded to extract_metadata.py.                                      +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--help   -hShow this message and exit. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/dev/breeze/doc/images/output_registry_extract-data.txt b/dev/breeze/doc/images/output_registry_extract-data.txt index a5b0e70c052a4..be322d5035541 100644 --- a/dev/breeze/doc/images/output_registry_extract-data.txt +++ b/dev/breeze/doc/images/output_registry_extract-data.txt @@ -1 +1 @@ -602ea508f9bcf0d5c2f97a220f5ee6d2 +5ece18e98af19619094e0ee3c439b73b diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands.py b/dev/breeze/src/airflow_breeze/commands/registry_commands.py index 4d7457aec723b..edbb27f22326d 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands.py @@ -68,9 +68,20 @@ def registry_group(): default=None, help="Extract only this provider ID (e.g. 'amazon'). Omit for full build.", ) +@click.option( + "--allow-unreleased", + is_flag=True, + default=False, + help=( + "Include providers and versions that don't have a matching " + "providers-/ git tag. Use for staging builds and local dev " + "where maintainers want to preview unreleased provider pages before " + "the tag lands. Forwarded to extract_metadata.py." + ), +) @option_verbose @option_dry_run -def extract_data(python: str, provider: str | None): +def extract_data(python: str, provider: str | None, allow_unreleased: bool): unique_project_name = f"breeze-registry-{uuid.uuid4().hex[:8]}" shell_params = ShellParams( @@ -89,9 +100,13 @@ def extract_data(python: str, provider: str | None): install_cmd = f"pip install --quiet {' '.join(suspended_packages)} && " if suspended_packages else "" provider_flag = f" --provider '{provider}'" if provider else "" + # --allow-unreleased only applies to extract_metadata.py (which owns the + # version filter). The other two scripts read from providers.json and + # don't need it. + metadata_extra = " --allow-unreleased" if allow_unreleased else "" command = ( f"{install_cmd}" - f"python dev/registry/extract_metadata.py{provider_flag} && " + f"python dev/registry/extract_metadata.py{provider_flag}{metadata_extra} && " f"python dev/registry/extract_parameters.py{provider_flag} && " f"python dev/registry/extract_connections.py{provider_flag}" ) diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py index f37cc46913b0b..d7b440367542f 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py @@ -32,6 +32,7 @@ "options": [ "--python", "--provider", + "--allow-unreleased", ], }, ], diff --git a/dev/registry/extract_metadata.py b/dev/registry/extract_metadata.py index 2f7ba8afeb021..7c0f91d55c2c5 100644 --- a/dev/registry/extract_metadata.py +++ b/dev/registry/extract_metadata.py @@ -35,6 +35,7 @@ import json import re import shutil +import subprocess import urllib.request import zlib from dataclasses import asdict, dataclass, field @@ -359,6 +360,48 @@ def find_related_providers(provider_id: str, all_provider_yamls: dict[str, dict] return related[:5] # Limit to 5 related providers +def load_release_tags() -> set[str]: + """Return all ``providers-/`` git tags as a set for fast lookup. + + Used to filter ``provider.yaml`` ``versions:`` lists to only entries that + correspond to a real release (excludes phantom version bumps where the + next-version entry was prepended to ``versions:`` before the tag landed, + or pre-release-only versions like ``providers-celery/3.19.0rc1`` where the + ``rc1`` exists but the final does not). + + Returns an empty set if the ``git`` command fails (e.g., outside a checkout); + callers can decide whether to fall back to the unfiltered top entry. + """ + try: + result = subprocess.run( + ["git", "tag", "--list", "providers-*"], + capture_output=True, + text=True, + cwd=AIRFLOW_ROOT, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError): + return set() + return {line.strip() for line in result.stdout.splitlines() if line.strip()} + + +def find_latest_released_version( + provider_id: str, + versions_list: list[str], + release_tags: set[str], +) -> str | None: + """Walk ``versions_list`` newest-first, return the first version with a real release tag. + + Returns ``None`` when no entry in ``versions_list`` has a corresponding + ``providers-/`` tag, indicating the provider is unreleased + (brand-new in-tree, no tags yet) or in an inconsistent state. + """ + for version in versions_list: + if f"providers-{provider_id}/{version}" in release_tags: + return version + return None + + def main(): """Main extraction function.""" import argparse @@ -369,6 +412,17 @@ def main(): default=None, help="Extract only this provider ID (e.g. 'amazon'). Omit for full build.", ) + parser.add_argument( + "--allow-unreleased", + action="store_true", + help=( + "Include providers and versions that don't have a matching " + "providers-/ git tag. Use for staging builds and local dev " + "where maintainers want to preview unreleased provider pages before " + "the tag lands. Default is to filter unreleased entries so live " + "builds don't ship phantom pointers." + ), + ) args = parser.parse_args() print("Airflow Registry Metadata Extractor") @@ -378,6 +432,8 @@ def main(): print(f"Incremental mode: extracting provider(s) {requested_providers}") else: requested_providers = None + if args.allow_unreleased: + print("Unreleased providers: INCLUDED (--allow-unreleased)") # Ensure output directory exists OUTPUT_DIR.mkdir(parents=True, exist_ok=True) @@ -415,6 +471,27 @@ def main(): else: extraction_ids = set(all_provider_yamls.keys()) + # Load all release tags once. Used below to filter `provider.yaml`'s + # `versions:` to only entries that have a real `providers-/` + # git tag, avoiding phantom-version leaks (next-release bumps prepended + # to `versions:` before the tag lands, RC-only releases, brand-new + # providers with no tags yet). + # + # Skipped entirely when --allow-unreleased is set: staging builds and + # local dev want to preview unreleased provider pages so maintainers + # can verify them before tagging. + if args.allow_unreleased: + release_tags: set[str] = set() + else: + release_tags = load_release_tags() + if not release_tags: + print( + " Warning: no providers-* git tags found; " + "phantom version filter is disabled (falling back to versions[0]). " + "If this is a CI run, ensure the checkout step uses fetch-tags: true." + ) + skipped_unreleased: list[str] = [] + # Second pass: Extract full metadata (only for providers in extraction_ids) for provider_id in extraction_ids: provider_yaml = all_provider_yamls[provider_id] @@ -447,9 +524,29 @@ def main(): if len(description) > 200: description = description[:197] + "..." - # Get versions - versions = provider_yaml.get("versions", []) - version = versions[0] if versions else "0.0.0" + # Get versions, filtering to entries that have a real release tag. + # Provider release prep prepends the next version to `versions:` BEFORE + # the tag lands, and pre-release-only versions match `versions:` but + # have no final tag. Without filtering, `version` (the latest pointer) + # AND the `versions` list both leak phantoms downstream -- the latter + # is consumed by extract_versions.py's backfill, which would try to + # `git show` from a non-existent tag. + raw_versions = provider_yaml.get("versions", []) + if release_tags: + versions = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags] + version = find_latest_released_version(provider_id, raw_versions, release_tags) + if version is None: + skipped_unreleased.append(provider_id) + print( + f" Skipping {provider_id}: no released version found in " + f"versions list {raw_versions} " + f"(no matching providers-{provider_id}/ tag)" + ) + continue + else: + # No tag information available -- fall back to old behaviour. + versions = list(raw_versions) + version = versions[0] if versions else "0.0.0" # Extract categories from integrations categories = extract_integrations_as_categories(provider_yaml) @@ -603,6 +700,12 @@ def main(): all_providers.append(provider) print(f" {provider_id}: {len(categories)} categories") + if skipped_unreleased: + print( + f"\nSkipped {len(skipped_unreleased)} unreleased provider(s) " + f"(no matching git tag): {sorted(skipped_unreleased)}" + ) + # Find related providers for provider in all_providers: provider.related_providers = find_related_providers(provider.id, all_provider_yamls) diff --git a/dev/registry/tests/test_extract_metadata.py b/dev/registry/tests/test_extract_metadata.py index 5bbccd7cb9bc1..df7b2de71b5ce 100644 --- a/dev/registry/tests/test_extract_metadata.py +++ b/dev/registry/tests/test_extract_metadata.py @@ -31,7 +31,9 @@ fetch_provider_inventory, fetch_pypi_dates, fetch_pypi_downloads, + find_latest_released_version, find_related_providers, + load_release_tags, module_path_to_file_path, parse_pyproject_toml, read_connection_urls, @@ -568,3 +570,145 @@ def test_tableau_resolves(self): conn_map = {"tableau": "connections/tableau.html"} url = resolve_connection_docs_url("tableau", conn_map, self.BASE) assert url == f"{self.BASE}/connections/tableau.html" + + +# --------------------------------------------------------------------------- +# load_release_tags +# --------------------------------------------------------------------------- +class TestLoadReleaseTags: + def test_parses_subprocess_output(self): + from unittest.mock import MagicMock, patch + + mock_result = MagicMock() + mock_result.stdout = ( + "providers-amazon/9.25.0\n" + "providers-amazon/9.26.0\n" + "providers-celery/3.18.0\n" + "providers-celery/3.19.0rc1\n" + "\n" # blank line + " providers-google/21.2.0 \n" # whitespace tolerated + ) + with patch("extract_metadata.subprocess.run", return_value=mock_result) as mock_run: + tags = load_release_tags() + + assert tags == { + "providers-amazon/9.25.0", + "providers-amazon/9.26.0", + "providers-celery/3.18.0", + "providers-celery/3.19.0rc1", + "providers-google/21.2.0", + } + # The git command runs against the providers-* glob + cmd = mock_run.call_args[0][0] + assert cmd[:3] == ["git", "tag", "--list"] + assert cmd[3] == "providers-*" + + def test_returns_empty_set_on_subprocess_failure(self): + from subprocess import CalledProcessError + from unittest.mock import patch + + with patch( + "extract_metadata.subprocess.run", + side_effect=CalledProcessError(1, ["git", "tag", "--list"]), + ): + tags = load_release_tags() + assert tags == set() + + def test_returns_empty_set_when_git_not_installed(self): + from unittest.mock import patch + + with patch("extract_metadata.subprocess.run", side_effect=FileNotFoundError): + tags = load_release_tags() + assert tags == set() + + +# --------------------------------------------------------------------------- +# find_latest_released_version +# --------------------------------------------------------------------------- +class TestFindLatestReleasedVersion: + def test_returns_top_when_top_has_tag(self): + tags = {"providers-amazon/9.26.0", "providers-amazon/9.25.0"} + assert find_latest_released_version("amazon", ["9.26.0", "9.25.0"], tags) == "9.26.0" + + def test_walks_past_phantom_top(self): + # celery 3.19.0 is in versions: but no final tag -- only rc1. + tags = { + "providers-celery/3.19.0rc1", + "providers-celery/3.18.0", + "providers-celery/3.17.2", + } + result = find_latest_released_version("celery", ["3.19.0", "3.18.0", "3.17.2"], tags) + assert result == "3.18.0" + + def test_returns_none_when_no_versions_have_tags(self): + # akeyless: brand-new provider, listed in versions: but never tagged. + tags = {"providers-amazon/9.26.0"} # different provider + result = find_latest_released_version("akeyless", ["1.0.0"], tags) + assert result is None + + def test_returns_none_for_empty_versions_list(self): + result = find_latest_released_version("amazon", [], {"providers-amazon/9.26.0"}) + assert result is None + + def test_rc_only_treated_as_phantom(self): + # Final 3.19.0 is missing; rc1/rc2 exist. Final must match exactly. + tags = {"providers-celery/3.19.0rc1", "providers-celery/3.19.0rc2", "providers-celery/3.18.0"} + # versions: list contains only the would-be final + result = find_latest_released_version("celery", ["3.19.0"], tags) + assert result is None + # When fallback also exists in versions, returns the fallback + result = find_latest_released_version("celery", ["3.19.0", "3.18.0"], tags) + assert result == "3.18.0" + + def test_does_not_match_other_providers_tags(self): + # provider id is part of the tag prefix; pure version coincidence shouldn't match + tags = {"providers-google/9.26.0"} + result = find_latest_released_version("amazon", ["9.26.0"], tags) + assert result is None + + +# --------------------------------------------------------------------------- +# Filter behaviour applied to the `versions` list (not just the latest pointer) +# --------------------------------------------------------------------------- +class TestVersionsListFiltering: + """Regression test for the bug where Provider.version (singular) was + filtered to a real release but Provider.versions (list) still contained + phantom entries. Downstream consumers like extract_versions.py read the + list and would chase non-existent backfill tags. + """ + + def test_filter_drops_phantom_top_from_list(self): + # This mirrors the in-loop logic. We don't have to test main() + # end-to-end -- the filter is a single comprehension that we can + # exercise directly to lock in the contract. + provider_id = "celery" + raw_versions = ["3.19.0", "3.18.0", "3.17.2"] + release_tags = { + "providers-celery/3.19.0rc1", # not the final + "providers-celery/3.18.0", + "providers-celery/3.17.2", + } + filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags] + assert filtered == ["3.18.0", "3.17.2"] + # And the latest pointer agrees + assert find_latest_released_version(provider_id, raw_versions, release_tags) == "3.18.0" + + def test_filter_drops_unreleased_provider(self): + provider_id = "akeyless" + raw_versions = ["1.0.0"] + release_tags = {"providers-amazon/9.26.0"} # different provider + filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags] + assert filtered == [] + assert find_latest_released_version(provider_id, raw_versions, release_tags) is None + + def test_filter_preserves_order(self): + provider_id = "amazon" + raw_versions = ["9.27.0", "9.26.0", "9.25.0", "9.24.0"] # 9.27.0 phantom + release_tags = { + "providers-amazon/9.26.0", + "providers-amazon/9.25.0", + "providers-amazon/9.24.0", + } + filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}" in release_tags] + # Order from raw_versions is preserved; only the phantom is dropped + assert filtered == ["9.26.0", "9.25.0", "9.24.0"]