apache · kaxil · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
@@ -53,23 +53,15 @@ on:  # yamllint disable-line rule:truthy
 
 permissions:
   contents: read
+  packages: read
 
 jobs:
-  build-and-publish-registry:
-    timeout-minutes: 30
-    name: "Build & Publish Registry"
-    runs-on: ubuntu-latest
-    env:
-      EXISTING_REGISTRY_DIR: /tmp/existing-registry
-      REGISTRY_DATA_DIR: dev/registry
-      REGISTRY_PROVIDERS_JSON: providers.json
-      REGISTRY_MODULES_JSON: modules.json
-      REGISTRY_SITE_DATA_DIR: registry/src/_data
-      REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions
-      REGISTRY_SITE_LOGOS_DIR: registry/public/logos
-      REGISTRY_CACHE_CONTROL: public, max-age=300
+  build-ci-image:
+    name: "Build CI image"
+    uses: ./.github/workflows/ci-image-build.yml
     permissions:
       contents: read
+      packages: write
     if: >
       github.event_name == 'workflow_call' ||
       contains(fromJSON('[
@@ -86,35 +78,51 @@ jobs:
         "utkarsharma2",
         "vincbeck"
         ]'), github.event.sender.login)
+    with:
+      runners: '["ubuntu-22.04"]'
+      platform: "linux/amd64"
+      push-image: "false"
+      upload-image-artifact: "true"
+      upload-mount-cache-artifact: "false"
+      python-versions: '["3.12"]'
+      branch: "main"
+      constraints-branch: "constraints-main"
+      use-uv: "true"
+      upgrade-to-newer-dependencies: "false"
+      docker-cache: "registry"
+      disable-airflow-repo-cache: "false"
+
+  build-and-publish-registry:
+    timeout-minutes: 30
+    name: "Build & Publish Registry"
+    needs: [build-ci-image]
+    runs-on: ubuntu-latest
+    env:
+      SCARF_ANALYTICS: "false"
+      DO_NOT_TRACK: "1"
+      EXISTING_REGISTRY_DIR: /tmp/existing-registry
+      REGISTRY_DATA_DIR: dev/registry
+      REGISTRY_PROVIDERS_JSON: providers.json
+      REGISTRY_MODULES_JSON: modules.json
+      REGISTRY_SITE_DATA_DIR: registry/src/_data
+      REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions
+      REGISTRY_SITE_LOGOS_DIR: registry/public/logos
+      REGISTRY_CACHE_CONTROL: public, max-age=300
+    permissions:
+      contents: read
     steps:
       - name: "Checkout repository"
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           persist-credentials: false
 
-      # --- Breeze setup ---
-      # All three extraction scripts run inside breeze so that
-      # extract_parameters.py and extract_connections.py can import provider
-      # classes at runtime.  extract_metadata.py also runs in breeze for
-      # consistency — it writes to dev/registry/ (mounted) so the other two
-      # scripts can read providers.json / modules.json from there.
-      - name: "Install Breeze"
-        uses: ./.github/actions/breeze
+      - name: "Prepare breeze & CI image"
+        uses: ./.github/actions/prepare_breeze_and_image
         with:
-          python-version: "3.12"
-
-      - name: "Build CI image"
-        # Fallback to raw docker buildx when breeze cache is stale — same
-        # pattern as publish-docs-to-s3.yml.
-        run: >
-          breeze ci-image build --python 3.12 ||
-          docker buildx build --load --builder default --progress=auto --pull
-          --build-arg AIRFLOW_EXTRAS=devel-ci --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES=false
-          --build-arg AIRFLOW_USE_UV=true --build-arg BUILD_PROGRESS=auto
-          --build-arg INSTALL_MYSQL_CLIENT_TYPE=mariadb
-          --build-arg VERSION_SUFFIX_FOR_PYPI=dev0
-          -t ghcr.io/apache/airflow/main/ci/python3.12:latest --target main .
-          -f Dockerfile.ci --platform linux/amd64
+          python: "3.12"
+          platform: "linux/amd64"
+          use-uv: "true"
+          make-mnt-writeable-and-cleanup: "true"
 
       - name: "Install AWS CLI v2"
         run: |
@@ -195,13 +203,17 @@ jobs:
           cp \
             "${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \
             "${REGISTRY_SITE_DATA_DIR}/${REGISTRY_MODULES_JSON}"
-          if [ -d "${REGISTRY_DATA_DIR}/output/versions" ]; then
-            cp -r "${REGISTRY_DATA_DIR}/output/versions/"* "${REGISTRY_SITE_VERSIONS_DIR}/"
+          VERSIONS_SRC="${REGISTRY_DATA_DIR}/output/versions"
+          if [ -d "${VERSIONS_SRC}" ] && ls "${VERSIONS_SRC}/"* &>/dev/null; then
+            cp -r "${VERSIONS_SRC}/"* "${REGISTRY_SITE_VERSIONS_DIR}/"
           fi
-          # Copy provider logos extracted from providers/*/docs/integration-logos/
-          if [ -d "${REGISTRY_DATA_DIR}/logos" ]; then
+          # Copy provider logos extracted by extract_metadata.py.
+          # The directory may exist but be empty (incremental build for
+          # a provider without logos), so check for files before copying.
+          LOGOS_SRC="${REGISTRY_DATA_DIR}/logos"
+          if [ -d "${LOGOS_SRC}" ] && ls "${LOGOS_SRC}/"* &>/dev/null; then
             mkdir -p "${REGISTRY_SITE_LOGOS_DIR}"
-            cp -r "${REGISTRY_DATA_DIR}/logos/"* "${REGISTRY_SITE_LOGOS_DIR}/"
+            cp -r "${LOGOS_SRC}/"* "${REGISTRY_SITE_LOGOS_DIR}/"
           fi
 
       - name: "Setup pnpm"
@@ -237,10 +249,41 @@ jobs:
       - name: "Sync registry to S3"
         env:
           S3_BUCKET: ${{ steps.destination.outputs.bucket }}
+          PROVIDER: ${{ inputs.provider }}
         run: |
+          # Incremental builds only extract data for the target provider(s).
+          # Eleventy rebuilds all pages, but non-target providers have
+          # incomplete data (no connections/parameters, wrong version info).
+          # Exclude both per-provider API JSON and HTML pages from the main
+          # sync, then selectively upload only the target provider's files.
+          EXCLUDE_PROVIDERS=()
+          if [[ -n "${PROVIDER}" ]]; then
+            # Exclude per-provider subtrees but re-include global listing
+            # pages. AWS CLI processes filters in order — later rules win.
+            EXCLUDE_PROVIDERS=(
+              --exclude "api/providers/*"
+              --exclude "providers/*"
+              --include "providers/index.html"
+            )
+          fi
+
           aws s3 sync registry/_site/ "${S3_BUCKET}" \
             --cache-control "${REGISTRY_CACHE_CONTROL}" \
-            --exclude "pagefind/*"
+            --exclude "pagefind/*" \
+            "${EXCLUDE_PROVIDERS[@]}"
+
+          # For incremental builds, sync only the target provider's files.
+          if [[ -n "${PROVIDER}" ]]; then
+            for pid in ${PROVIDER}; do
+              aws s3 sync "registry/_site/api/providers/${pid}/" \
+                "${S3_BUCKET}api/providers/${pid}/" \
+                --cache-control "${REGISTRY_CACHE_CONTROL}"
+              aws s3 sync "registry/_site/providers/${pid}/" \
+                "${S3_BUCKET}providers/${pid}/" \
+                --cache-control "${REGISTRY_CACHE_CONTROL}"
+            done
+          fi
+
           # Pagefind generates content-hashed filenames (e.g. en_181da6f.pf_index).
           # Each rebuild produces new hashes, so --delete is needed to remove stale
           # index files. This is separate from the main sync which intentionally

@@ -91,8 +91,8 @@ def extract_data(python: str, provider: str | None):
     command = (
         f"{install_cmd}"
         f"python dev/registry/extract_metadata.py{provider_flag} && "
-        "python dev/registry/extract_parameters.py && "
-        "python dev/registry/extract_connections.py"
+        f"python dev/registry/extract_parameters.py{provider_flag} && "
+        f"python dev/registry/extract_connections.py{provider_flag}"
     )
 
     with ci_group("Extracting registry data"):

@@ -162,7 +162,7 @@ def main():
     parser.add_argument(
         "--provider",
         default=None,
-        help="Only output connections for this provider ID (e.g. 'amazon').",
+        help="Only output connections for these provider ID(s) (space-separated, e.g. 'amazon common-io').",
     )
     parser.add_argument(
         "--providers-json",
@@ -212,12 +212,21 @@ def main():
     total_with_custom = 0
     total_with_ui = 0
 
+    # Parse space-separated provider filter (matches extract_metadata.py behaviour)
+    provider_filter: set[str] | None = None
+    if args.provider:
+        provider_filter = {pid.strip() for pid in args.provider.split() if pid.strip()}
+        print(f"Filtering to provider(s): {', '.join(sorted(provider_filter))}")
+
     for conn_type, hook_info in sorted(hooks.items()):
         if hook_info is None or not hook_info.package_name:
             continue
 
         provider_id = package_name_to_provider_id(hook_info.package_name)
 
+        if provider_filter and provider_id not in provider_filter:
+            continue
+
         standard_fields = build_standard_fields(field_behaviours.get(conn_type))
         custom_fields = build_custom_fields(form_widgets, conn_type)
 
@@ -244,13 +253,6 @@ def main():
     print(f"  {total_with_custom} have custom fields")
     print(f"  {total_with_ui} have UI field customisation")
 
-    # Filter to single provider if requested
-    if args.provider:
-        provider_connections = {
-            pid: conns for pid, conns in provider_connections.items() if pid == args.provider
-        }
-        print(f"Filtering output to provider: {args.provider}")
-
     # Write per-provider files to versions/{pid}/{version}/connections.json
     for output_dir in OUTPUT_DIRS:
         if not output_dir.parent.exists():

@@ -56,7 +56,9 @@ def merge(
     existing_modules: list[dict] = []
     if existing_modules_path.exists():
         existing_modules = json.loads(existing_modules_path.read_text())["modules"]
-    new_modules = json.loads(new_modules_path.read_text())["modules"]
+    new_modules: list[dict] = []
+    if new_modules_path.exists():
+        new_modules = json.loads(new_modules_path.read_text())["modules"]
 
     # IDs being replaced
     new_ids = {p["id"] for p in new_providers}

@@ -241,6 +241,41 @@ def test_missing_existing_providers_file(self, tmp_path, output_dir):
         assert len(result_providers) == 1
         assert result_providers[0]["id"] == "amazon"
 
+    def test_missing_new_modules_file(self, tmp_path, output_dir):
+        """Incremental extract with --provider skips modules.json; merge should keep existing modules."""
+        existing_providers = _write_json(
+            tmp_path / "existing_providers.json",
+            {
+                "providers": [
+                    _provider("amazon", "Amazon", "2024-01-01"),
+                    _provider("google", "Google", "2024-02-01"),
+                ]
+            },
+        )
+        existing_modules = _write_json(
+            tmp_path / "existing_modules.json",
+            {
+                "modules": [
+                    _module("amazon-s3-op", "amazon"),
+                    _module("google-bq-op", "google"),
+                ]
+            },
+        )
+        new_providers = _write_json(
+            tmp_path / "new_providers.json",
+            {"providers": [_provider("amazon", "Amazon", "2025-01-01")]},
+        )
+        # new_modules file does not exist (--provider mode skips modules.json)
+        new_modules = tmp_path / "nonexistent_modules.json"
+
+        merge(existing_providers, existing_modules, new_providers, new_modules, output_dir)
+
+        result_modules = json.loads((output_dir / "modules.json").read_text())["modules"]
+        # Existing modules for non-updated providers are kept
+        assert any(m["id"] == "google-bq-op" for m in result_modules)
+        # Existing modules for the updated provider are removed (no new ones to replace them)
+        assert not any(m["provider_id"] == "amazon" for m in result_modules)
+
     def test_output_directory_created_if_missing(self, tmp_path):
         output_dir = tmp_path / "does" / "not" / "exist"
         existing_providers = _write_json(

@@ -404,7 +404,10 @@ The registry is built in the `apache/airflow` repo and served at `airflow.apache
    Supports two modes:
    - **Full build** (no `provider` input): extracts all ~99 providers (~12 min)
    - **Incremental build** (`provider=amazon`): extracts one provider (~30s), merges
-     with existing data from S3 via `merge_registry_data.py`, then builds the full site
+     with existing data from S3 via `merge_registry_data.py`, then builds the full site.
+     The main S3 sync excludes the per-provider `api/providers/` and `providers/` subtrees,
+     then uploads only the target provider's files, so real data is not overwritten by
+     Eleventy's incomplete/empty stubs (Eleventy 3.x `permalink: false` does not work with pagination).
 2. **S3 buckets**: `{live|staging}-docs-airflow-apache-org/registry/` (same bucket as docs, different prefix)
 3. **Serving**: Apache HTTPD at `airflow.apache.org` rewrites `/registry/*` to CloudFront, which serves from S3
 4. **Auto-trigger**: When `publish-docs-to-s3.yml` publishes provider docs, its

@@ -327,17 +327,31 @@ it triggers `registry-build.yml` with the provider ID. The incremental flow:
    metadata and PyPI stats; `extract_parameters.py` discovers modules for only the
    specified provider.
 3. **Merge** — `merge_registry_data.py` replaces the updated provider's entries in
-   the downloaded JSON while keeping all other providers intact.
+   the downloaded JSON while keeping all other providers intact. Only global files
+   (`providers.json`, `modules.json`) are merged — per-version files like
+   `connections.json` and `parameters.json` are not downloaded from S3.
 4. **Build site** — Eleventy builds all pages from the merged data; Pagefind indexes
-   all records.
-5. **S3 sync** — only changed pages are uploaded (S3 sync diffs).
+   all records. Because per-version data only exists for the target provider, Eleventy
+   emits empty fallback JSON for other providers' `connections.json` and
+   `parameters.json` API endpoints (see **Known limitation** below).
+5. **S3 sync (selective)** — the main sync excludes the per-provider `api/providers/` and
+   `providers/` subtrees (keeping `providers/index.html`) to avoid overwriting real data with
+   incomplete/empty stubs. A second pass uploads only the target provider's API files and pages.
 6. **Publish versions** — `publish_versions.py` updates `api/providers/{id}/versions.json`.
 
 The merge script (`dev/registry/merge_registry_data.py`) handles edge cases:
 
 - First deploy (no existing data on S3): uses the single-provider output as-is.
 - Missing modules file: treated as empty.
 
+**Known limitation**: Eleventy's pagination templates generate API files for every
+provider in `providers.json`, even when per-version data (connections, parameters) only
+exists for the target provider. The templates emit empty fallback JSON
+(`{"connection_types":[]}`) for providers without data. The S3 sync step works around
+this with `--exclude` patterns during incremental builds. A proper template-level fix
+(skipping file generation) is tracked as a follow-up — `permalink: false` does not work
+with Eleventy 3.x pagination templates.
+
 To run an incremental build locally:
 
 ```bash

@@ -4,7 +4,7 @@
   "description": "Apache Airflow Provider Registry",
   "scripts": {
     "dev": "REGISTRY_PATH_PREFIX=/ pnpm build && REGISTRY_PATH_PREFIX=/ eleventy --serve --port=8080",
-    "prebuild": "uv run python ../dev/registry/export_registry_schemas.py",
+    "prebuild": "uv run --project ../dev/registry python ../dev/registry/export_registry_schemas.py",
     "build": "rm -rf _site && eleventy",
     "postbuild": "cleancss -o _site/css/main.css _site/css/main.css && node scripts/build-pagefind-index.mjs"
   },