diff --git a/src/adcp/compat/legacy/v2_5/_media_buy_helpers.py b/src/adcp/compat/legacy/v2_5/_media_buy_helpers.py index cb49f206..3d6b361c 100644 --- a/src/adcp/compat/legacy/v2_5/_media_buy_helpers.py +++ b/src/adcp/compat/legacy/v2_5/_media_buy_helpers.py @@ -14,17 +14,22 @@ from typing import Any -from adcp.compat.legacy.v2_5._url import strip_url_scheme +from adcp.compat.legacy.v2_5._url import extract_brand_domain def adapt_brand_manifest_to_brand(payload: dict[str, Any]) -> dict[str, Any]: """Rewrite v2.5 ``brand_manifest`` (URL string) to v3 ``brand`` - ``{domain: ...}``. Caller-supplied ``brand`` wins when both fields - are present (half-migrated buyer).""" + ``{domain: ...}``. Caller-supplied ``brand`` wins when both fields + are present (half-migrated buyer). + + Uses ``extract_brand_domain`` to isolate the hostname from full URLs + (e.g. ``"https://acme.com/.well-known/brand.json"`` → ``"acme.com"``) + so the result satisfies ``BrandReference.domain``'s hostname-only regex. + """ out = dict(payload) manifest = out.pop("brand_manifest", None) if isinstance(manifest, str) and manifest and "brand" not in out: - out["brand"] = {"domain": strip_url_scheme(manifest)} + out["brand"] = {"domain": extract_brand_domain(manifest)} return out diff --git a/src/adcp/compat/legacy/v2_5/_url.py b/src/adcp/compat/legacy/v2_5/_url.py index 23aa913b..1c2e0f09 100644 --- a/src/adcp/compat/legacy/v2_5/_url.py +++ b/src/adcp/compat/legacy/v2_5/_url.py @@ -8,6 +8,8 @@ from __future__ import annotations +from urllib.parse import urlparse + def strip_url_scheme(url: str) -> str: """``https://acme.example.com/`` → ``acme.example.com``. @@ -16,6 +18,10 @@ def strip_url_scheme(url: str) -> str: after trailing-slash strip), ``http://`` schemes (legacy clients don't all enforce https), and trailing slashes from sloppy concatenation. + + Does NOT strip URL paths or ports. Use ``extract_brand_domain`` + when the input may be a full URL (scheme + path) and you need only + the hostname component. """ s = url.strip() for prefix in ("https://", "http://"): @@ -23,3 +29,41 @@ def strip_url_scheme(url: str) -> str: s = s[len(prefix) :] break return s.rstrip("/") + + +def extract_brand_domain(url: str) -> str: + """Extract a ``BrandReference.domain``-safe hostname from a brand_manifest URL. + + v2.5 ``brand_manifest`` is documented as a URL to a JSON file + (e.g. ``"https://acme.com/.well-known/brand.json"``). The v3 + ``BrandReference.domain`` field requires a bare hostname matching + ``^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$``. + + Behaviour by input shape: + + * Full URL with scheme (``https://acme.com/path``): + ``urlparse`` extracts the hostname (``"acme.com"``); path and + port are discarded. Uppercase schemes are normalised by + ``urlparse``; uppercase hostnames are lowercased by ``urlparse`` + (e.g. ``HTTPS://ACME.COM/path`` → ``"acme.com"``). + * URL with port (``https://acme.com:8443/path``): + hostname is extracted without the port (``"acme.com"``). + * Bare domain without scheme (``acme.com``): + ``urlparse`` returns ``hostname=None``; falls back to + ``strip_url_scheme`` which returns the input unchanged after + stripping any trailing slashes. + * IPv6 literal (``https://[::1]/path``): + ``urlparse`` returns ``hostname="::1"`` (brackets stripped). + This value does not satisfy ``BrandReference.domain``'s regex; + callers must validate the result if IPv6 addresses are possible. + + The caller is responsible for ensuring ``url`` is non-empty before + calling (both adapter call sites already gate on + ``isinstance(manifest, str) and manifest``). + """ + s = url.strip() + # urlparse correctly handles full URLs: extracts hostname, drops path and port. + # For bare domains (no scheme), urlparse treats the input as a path-only URI + # and returns hostname=None, so we fall back to strip_url_scheme. + hostname = urlparse(s).hostname + return hostname if hostname is not None else strip_url_scheme(s) diff --git a/src/adcp/compat/legacy/v2_5/get_products.py b/src/adcp/compat/legacy/v2_5/get_products.py index 2ea88ce2..caf37fed 100644 --- a/src/adcp/compat/legacy/v2_5/get_products.py +++ b/src/adcp/compat/legacy/v2_5/get_products.py @@ -40,7 +40,7 @@ from adcp.compat.legacy import register_adapter from adcp.compat.legacy.types import AdapterPair -from adcp.compat.legacy.v2_5._url import strip_url_scheme +from adcp.compat.legacy.v2_5._url import extract_brand_domain # v2.5 channel buckets to v3 channel slugs. Multi-mapped buckets resolve # to all listed v3 channels; downstream consumers can narrow further via @@ -127,10 +127,14 @@ def adapt_request(payload: dict[str, Any]) -> dict[str, Any]: """Translate a v2.5 ``get_products`` request to v3 shape.""" out = dict(payload) - # brand_manifest (v2.5 URL string) → brand.domain (v3 BrandReference) + # brand_manifest (v2.5 URL string) → brand.domain (v3 BrandReference). + # v2.5 spec documents brand_manifest as a URL to a JSON file, so paths + # are the expected input shape. extract_brand_domain uses urlparse to + # isolate the hostname so the result satisfies BrandReference.domain's + # hostname-only regex. brand_manifest = out.pop("brand_manifest", None) if isinstance(brand_manifest, str) and brand_manifest and "brand" not in out: - out["brand"] = {"domain": strip_url_scheme(brand_manifest)} + out["brand"] = {"domain": extract_brand_domain(brand_manifest)} # promoted_offerings → catalog promoted = out.pop("promoted_offerings", None) diff --git a/tests/test_legacy_adapter_v2_5_get_products.py b/tests/test_legacy_adapter_v2_5_get_products.py index 7fa873dd..71ace69a 100644 --- a/tests/test_legacy_adapter_v2_5_get_products.py +++ b/tests/test_legacy_adapter_v2_5_get_products.py @@ -57,6 +57,20 @@ def test_brand_manifest_strips_trailing_slash() -> None: assert out["brand"] == {"domain": "acme.example.com"} +def test_brand_manifest_with_path_extracts_hostname() -> None: + """Regression for #677: brand_manifest URL with a path must extract only + the hostname so the result satisfies BrandReference.domain's regex.""" + out = _adapt({"brand_manifest": "https://acme.com/.well-known/brand.json"}) + assert out["brand"] == {"domain": "acme.com"} + assert "brand_manifest" not in out + + +def test_brand_manifest_with_port_drops_port() -> None: + """Port numbers are stripped; only the hostname reaches brand.domain.""" + out = _adapt({"brand_manifest": "https://acme.com:8443/.well-known/brand.json"}) + assert out["brand"] == {"domain": "acme.com"} + + def test_brand_field_takes_precedence_over_manifest() -> None: """Half-migrated buyer sends both — keep the v3 field.""" out = _adapt( diff --git a/tests/test_legacy_adapter_v2_5_media_buy.py b/tests/test_legacy_adapter_v2_5_media_buy.py index 872f8974..d96062c3 100644 --- a/tests/test_legacy_adapter_v2_5_media_buy.py +++ b/tests/test_legacy_adapter_v2_5_media_buy.py @@ -44,6 +44,20 @@ def test_create_media_buy_brand_manifest_becomes_brand_domain() -> None: assert "brand_manifest" not in out +def test_create_media_buy_brand_manifest_with_path_extracts_hostname() -> None: + """Regression for #677: brand_manifest URL with a path must extract only + the hostname so the result satisfies BrandReference.domain's regex.""" + out = v2_5_cmb.adapt_request({"brand_manifest": "https://acme.com/.well-known/brand.json"}) + assert out["brand"] == {"domain": "acme.com"} + assert "brand_manifest" not in out + + +def test_create_media_buy_brand_manifest_with_port_drops_port() -> None: + """Port numbers are stripped; only the hostname reaches brand.domain.""" + out = v2_5_cmb.adapt_request({"brand_manifest": "https://acme.com:8443/.well-known/brand.json"}) + assert out["brand"] == {"domain": "acme.com"} + + def test_create_media_buy_brand_wins_over_manifest_when_both_present() -> None: out = v2_5_cmb.adapt_request( {