Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 84 additions & 41 deletions .github/workflows/registry-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,23 +53,15 @@ on: # yamllint disable-line rule:truthy

permissions:
contents: read
packages: read

jobs:
build-and-publish-registry:
timeout-minutes: 30
name: "Build & Publish Registry"
runs-on: ubuntu-latest
env:
EXISTING_REGISTRY_DIR: /tmp/existing-registry
REGISTRY_DATA_DIR: dev/registry
REGISTRY_PROVIDERS_JSON: providers.json
REGISTRY_MODULES_JSON: modules.json
REGISTRY_SITE_DATA_DIR: registry/src/_data
REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions
REGISTRY_SITE_LOGOS_DIR: registry/public/logos
REGISTRY_CACHE_CONTROL: public, max-age=300
build-ci-image:
name: "Build CI image"
uses: ./.github/workflows/ci-image-build.yml
permissions:
contents: read
packages: write
if: >
github.event_name == 'workflow_call' ||
contains(fromJSON('[
Expand All @@ -86,35 +78,51 @@ jobs:
"utkarsharma2",
"vincbeck"
]'), github.event.sender.login)
with:
runners: '["ubuntu-22.04"]'
platform: "linux/amd64"
push-image: "false"
upload-image-artifact: "true"
upload-mount-cache-artifact: "false"
python-versions: '["3.12"]'
branch: "main"
constraints-branch: "constraints-main"
use-uv: "true"
upgrade-to-newer-dependencies: "false"
docker-cache: "registry"
disable-airflow-repo-cache: "false"

build-and-publish-registry:
timeout-minutes: 30
name: "Build & Publish Registry"
needs: [build-ci-image]
runs-on: ubuntu-latest
env:
SCARF_ANALYTICS: "false"
DO_NOT_TRACK: "1"
EXISTING_REGISTRY_DIR: /tmp/existing-registry
REGISTRY_DATA_DIR: dev/registry
REGISTRY_PROVIDERS_JSON: providers.json
REGISTRY_MODULES_JSON: modules.json
REGISTRY_SITE_DATA_DIR: registry/src/_data
REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions
REGISTRY_SITE_LOGOS_DIR: registry/public/logos
REGISTRY_CACHE_CONTROL: public, max-age=300
permissions:
contents: read
steps:
- name: "Checkout repository"
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

# --- Breeze setup ---
# All three extraction scripts run inside breeze so that
# extract_parameters.py and extract_connections.py can import provider
# classes at runtime. extract_metadata.py also runs in breeze for
# consistency — it writes to dev/registry/ (mounted) so the other two
# scripts can read providers.json / modules.json from there.
- name: "Install Breeze"
uses: ./.github/actions/breeze
- name: "Prepare breeze & CI image"
uses: ./.github/actions/prepare_breeze_and_image
with:
python-version: "3.12"

- name: "Build CI image"
# Fallback to raw docker buildx when breeze cache is stale — same
# pattern as publish-docs-to-s3.yml.
run: >
breeze ci-image build --python 3.12 ||
docker buildx build --load --builder default --progress=auto --pull
--build-arg AIRFLOW_EXTRAS=devel-ci --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES=false
--build-arg AIRFLOW_USE_UV=true --build-arg BUILD_PROGRESS=auto
--build-arg INSTALL_MYSQL_CLIENT_TYPE=mariadb
--build-arg VERSION_SUFFIX_FOR_PYPI=dev0
-t ghcr.io/apache/airflow/main/ci/python3.12:latest --target main .
-f Dockerfile.ci --platform linux/amd64
python: "3.12"
platform: "linux/amd64"
use-uv: "true"
make-mnt-writeable-and-cleanup: "true"

- name: "Install AWS CLI v2"
run: |
Expand Down Expand Up @@ -195,13 +203,17 @@ jobs:
cp \
"${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \
"${REGISTRY_SITE_DATA_DIR}/${REGISTRY_MODULES_JSON}"
if [ -d "${REGISTRY_DATA_DIR}/output/versions" ]; then
cp -r "${REGISTRY_DATA_DIR}/output/versions/"* "${REGISTRY_SITE_VERSIONS_DIR}/"
VERSIONS_SRC="${REGISTRY_DATA_DIR}/output/versions"
if [ -d "${VERSIONS_SRC}" ] && ls "${VERSIONS_SRC}/"* &>/dev/null; then
cp -r "${VERSIONS_SRC}/"* "${REGISTRY_SITE_VERSIONS_DIR}/"
fi
# Copy provider logos extracted from providers/*/docs/integration-logos/
if [ -d "${REGISTRY_DATA_DIR}/logos" ]; then
# Copy provider logos extracted by extract_metadata.py.
# The directory may exist but be empty (incremental build for
# a provider without logos), so check for files before copying.
LOGOS_SRC="${REGISTRY_DATA_DIR}/logos"
if [ -d "${LOGOS_SRC}" ] && ls "${LOGOS_SRC}/"* &>/dev/null; then
mkdir -p "${REGISTRY_SITE_LOGOS_DIR}"
cp -r "${REGISTRY_DATA_DIR}/logos/"* "${REGISTRY_SITE_LOGOS_DIR}/"
cp -r "${LOGOS_SRC}/"* "${REGISTRY_SITE_LOGOS_DIR}/"
fi

- name: "Setup pnpm"
Expand Down Expand Up @@ -237,10 +249,41 @@ jobs:
- name: "Sync registry to S3"
env:
S3_BUCKET: ${{ steps.destination.outputs.bucket }}
PROVIDER: ${{ inputs.provider }}
run: |
# Incremental builds only extract data for the target provider(s).
# Eleventy rebuilds all pages, but non-target providers have
# incomplete data (no connections/parameters, wrong version info).
# Exclude both per-provider API JSON and HTML pages from the main
# sync, then selectively upload only the target provider's files.
EXCLUDE_PROVIDERS=()
if [[ -n "${PROVIDER}" ]]; then
# Exclude per-provider subtrees but re-include global listing
# pages. AWS CLI processes filters in order — later rules win.
EXCLUDE_PROVIDERS=(
--exclude "api/providers/*"
--exclude "providers/*"
--include "providers/index.html"
)
fi

aws s3 sync registry/_site/ "${S3_BUCKET}" \
--cache-control "${REGISTRY_CACHE_CONTROL}" \
--exclude "pagefind/*"
--exclude "pagefind/*" \
"${EXCLUDE_PROVIDERS[@]}"

# For incremental builds, sync only the target provider's files.
if [[ -n "${PROVIDER}" ]]; then
for pid in ${PROVIDER}; do
aws s3 sync "registry/_site/api/providers/${pid}/" \
"${S3_BUCKET}api/providers/${pid}/" \
--cache-control "${REGISTRY_CACHE_CONTROL}"
aws s3 sync "registry/_site/providers/${pid}/" \
"${S3_BUCKET}providers/${pid}/" \
--cache-control "${REGISTRY_CACHE_CONTROL}"
done
fi

# Pagefind generates content-hashed filenames (e.g. en_181da6f.pf_index).
# Each rebuild produces new hashes, so --delete is needed to remove stale
# index files. This is separate from the main sync which intentionally
Expand Down
4 changes: 2 additions & 2 deletions dev/breeze/src/airflow_breeze/commands/registry_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ def extract_data(python: str, provider: str | None):
command = (
f"{install_cmd}"
f"python dev/registry/extract_metadata.py{provider_flag} && "
"python dev/registry/extract_parameters.py && "
"python dev/registry/extract_connections.py"
f"python dev/registry/extract_parameters.py{provider_flag} && "
f"python dev/registry/extract_connections.py{provider_flag}"
)

with ci_group("Extracting registry data"):
Expand Down
18 changes: 10 additions & 8 deletions dev/registry/extract_connections.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def main():
parser.add_argument(
"--provider",
default=None,
help="Only output connections for this provider ID (e.g. 'amazon').",
help="Only output connections for these provider ID(s) (space-separated, e.g. 'amazon common-io').",
)
parser.add_argument(
"--providers-json",
Expand Down Expand Up @@ -212,12 +212,21 @@ def main():
total_with_custom = 0
total_with_ui = 0

# Parse space-separated provider filter (matches extract_metadata.py behaviour)
provider_filter: set[str] | None = None
if args.provider:
provider_filter = {pid.strip() for pid in args.provider.split() if pid.strip()}
print(f"Filtering to provider(s): {', '.join(sorted(provider_filter))}")

for conn_type, hook_info in sorted(hooks.items()):
if hook_info is None or not hook_info.package_name:
continue

provider_id = package_name_to_provider_id(hook_info.package_name)

if provider_filter and provider_id not in provider_filter:
continue

standard_fields = build_standard_fields(field_behaviours.get(conn_type))
custom_fields = build_custom_fields(form_widgets, conn_type)

Expand All @@ -244,13 +253,6 @@ def main():
print(f" {total_with_custom} have custom fields")
print(f" {total_with_ui} have UI field customisation")

# Filter to single provider if requested
if args.provider:
provider_connections = {
pid: conns for pid, conns in provider_connections.items() if pid == args.provider
}
print(f"Filtering output to provider: {args.provider}")

# Write per-provider files to versions/{pid}/{version}/connections.json
for output_dir in OUTPUT_DIRS:
if not output_dir.parent.exists():
Expand Down
4 changes: 3 additions & 1 deletion dev/registry/merge_registry_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def merge(
existing_modules: list[dict] = []
if existing_modules_path.exists():
existing_modules = json.loads(existing_modules_path.read_text())["modules"]
new_modules = json.loads(new_modules_path.read_text())["modules"]
new_modules: list[dict] = []
if new_modules_path.exists():
new_modules = json.loads(new_modules_path.read_text())["modules"]

# IDs being replaced
new_ids = {p["id"] for p in new_providers}
Expand Down
35 changes: 35 additions & 0 deletions dev/registry/tests/test_merge_registry_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,41 @@ def test_missing_existing_providers_file(self, tmp_path, output_dir):
assert len(result_providers) == 1
assert result_providers[0]["id"] == "amazon"

def test_missing_new_modules_file(self, tmp_path, output_dir):
    """Merging must succeed when the new modules file is absent.

    Scenario: an incremental ``--provider`` extract that produced no
    ``modules.json``. Expected outcome, as asserted below:

    * modules of providers that were *not* re-extracted stay in the output;
    * modules of the re-extracted provider are dropped, since nothing new
      is available to replace them.
    """
    # Previously published state: two providers, one module each.
    published_providers_payload = {
        "providers": [
            _provider("amazon", "Amazon", "2024-01-01"),
            _provider("google", "Google", "2024-02-01"),
        ]
    }
    published_providers_file = _write_json(
        tmp_path / "existing_providers.json", published_providers_payload
    )

    published_modules_payload = {
        "modules": [
            _module("amazon-s3-op", "amazon"),
            _module("google-bq-op", "google"),
        ]
    }
    published_modules_file = _write_json(
        tmp_path / "existing_modules.json", published_modules_payload
    )

    # Fresh extract: only "amazon" was refreshed.
    fresh_providers_payload = {"providers": [_provider("amazon", "Amazon", "2025-01-01")]}
    fresh_providers_file = _write_json(tmp_path / "new_providers.json", fresh_providers_payload)

    # Deliberately never written to disk (--provider mode skips modules.json).
    absent_modules_file = tmp_path / "nonexistent_modules.json"

    merge(
        published_providers_file,
        published_modules_file,
        fresh_providers_file,
        absent_modules_file,
        output_dir,
    )

    merged_text = (output_dir / "modules.json").read_text()
    merged_modules = json.loads(merged_text)["modules"]

    # The untouched provider's module survives the merge.
    google_entry = next((mod for mod in merged_modules if mod["id"] == "google-bq-op"), None)
    assert google_entry is not None
    # The refreshed provider's old modules are gone (no replacements exist).
    assert all(mod["provider_id"] != "amazon" for mod in merged_modules)

def test_output_directory_created_if_missing(self, tmp_path):
output_dir = tmp_path / "does" / "not" / "exist"
existing_providers = _write_json(
Expand Down
5 changes: 4 additions & 1 deletion registry/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,10 @@ The registry is built in the `apache/airflow` repo and served at `airflow.apache
Supports two modes:
- **Full build** (no `provider` input): extracts all ~99 providers (~12 min)
- **Incremental build** (`provider=amazon`): extracts one provider (~30s), merges
with existing data from S3 via `merge_registry_data.py`, then builds the full site
with existing data from S3 via `merge_registry_data.py`, then builds the full site.
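The main S3 sync step excludes the per-provider `api/providers/` and `providers/`
subtrees (re-including only `providers/index.html`), and a follow-up sync uploads
just the target provider's files. This avoids overwriting other providers' real
data with Eleventy's incomplete/empty stubs (Eleventy 3.x `permalink: false` does
not work with pagination).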
2. **S3 buckets**: `{live|staging}-docs-airflow-apache-org/registry/` (same bucket as docs, different prefix)
3. **Serving**: Apache HTTPD at `airflow.apache.org` rewrites `/registry/*` to CloudFront, which serves from S3
4. **Auto-trigger**: When `publish-docs-to-s3.yml` publishes provider docs, its
Expand Down
20 changes: 17 additions & 3 deletions registry/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -327,17 +327,31 @@ it triggers `registry-build.yml` with the provider ID. The incremental flow:
metadata and PyPI stats; `extract_parameters.py` discovers modules for only the
specified provider.
3. **Merge** — `merge_registry_data.py` replaces the updated provider's entries in
the downloaded JSON while keeping all other providers intact.
the downloaded JSON while keeping all other providers intact. Only global files
(`providers.json`, `modules.json`) are merged — per-version files like
`connections.json` and `parameters.json` are not downloaded from S3.
4. **Build site** — Eleventy builds all pages from the merged data; Pagefind indexes
all records.
5. **S3 sync** — only changed pages are uploaded (S3 sync diffs).
all records. Because per-version data only exists for the target provider, Eleventy
emits empty fallback JSON for other providers' `connections.json` and
`parameters.json` API endpoints (see **Known limitation** below).
5. **S3 sync (selective)** — the main sync excludes the per-provider `api/providers/`
and `providers/` subtrees (while still uploading `providers/index.html`) to avoid
overwriting real data with incomplete/empty stubs. A second sync then uploads only
the target provider's API files and HTML pages.
6. **Publish versions** — `publish_versions.py` updates `api/providers/{id}/versions.json`.

The merge script (`dev/registry/merge_registry_data.py`) handles edge cases:

- First deploy (no existing data on S3): uses the single-provider output as-is.
- Missing modules file: treated as empty.

**Known limitation**: Eleventy's pagination templates generate API files for every
provider in `providers.json`, even when per-version data (connections, parameters) only
exists for the target provider. The templates emit empty fallback JSON
(`{"connection_types":[]}`) for providers without data. The S3 sync step works around
this with `--exclude` patterns during incremental builds. A proper template-level fix
(skipping file generation) is tracked as a follow-up — `permalink: false` does not work
with Eleventy 3.x pagination templates.

To run an incremental build locally:

```bash
Expand Down
2 changes: 1 addition & 1 deletion registry/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"description": "Apache Airflow Provider Registry",
"scripts": {
"dev": "REGISTRY_PATH_PREFIX=/ pnpm build && REGISTRY_PATH_PREFIX=/ eleventy --serve --port=8080",
"prebuild": "uv run python ../dev/registry/export_registry_schemas.py",
"prebuild": "uv run --project ../dev/registry python ../dev/registry/export_registry_schemas.py",
"build": "rm -rf _site && eleventy",
"postbuild": "cleancss -o _site/css/main.css _site/css/main.css && node scripts/build-pagefind-index.mjs"
},
Expand Down
Loading
Loading