Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 92 additions & 6 deletions src/adcp/adagents.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,16 +949,102 @@ def _resolve_agent_properties(
and {t for t in p.get("tags", []) if isinstance(t, str)} & authorized_tags
]

# Handle publisher_properties (cross-domain references)
# Handle publisher_properties: inline-resolution path per adcp#4827.
# For each selector, fan out over its domain(s), then try to satisfy from
# the parent file's top-level properties[] before considering a federated
# fetch. Federated fetch (per-domain HTTP) is a follow-up; this change
# fixes the primary bug of returning raw selector dicts instead of resolved
# property objects.
if authorization_type == "publisher_properties":
publisher_props = agent.get("publisher_properties", [])
if not isinstance(publisher_props, list):
selectors = agent.get("publisher_properties", [])
if not isinstance(selectors, list):
return []
return [p for p in publisher_props if isinstance(p, dict)]
# Pre-index parent properties by domain once — O(N) — so per-domain
# lookups are O(1) instead of O(N), avoiding O(N×M) at cafemedia scale
# (6,843 properties × 6,800 domains = 46 M ops without this index).
domain_index: dict[str, list[dict[str, Any]]] = {}
for p in top_level_properties:
if isinstance(p, dict):
d = p.get("publisher_domain")
if isinstance(d, str) and d:
domain_index.setdefault(d, []).append(p)
resolved: list[dict[str, Any]] = []
seen_ids: set[str | None] = set()
for selector in selectors:
if not isinstance(selector, dict):
continue
for domain in _selector_domains(selector):
inline = _resolve_inline(selector, domain_index, domain)
if inline is not None:
for prop in inline:
pid = prop.get("property_id")
if pid not in seen_ids:
seen_ids.add(pid)
resolved.append(prop)
# inline succeeded; skip federated fetch for this domain
# inline is None → no parent-file data for domain; federated
# fetch would go here (not yet implemented; see #749 Part 2).
return resolved

return []


def _selector_domains(selector: dict[str, Any]) -> list[str]:
"""Extract publisher domain(s) from a publisher_properties selector.

Handles both the scalar ``publisher_domain`` form and the compact
``publisher_domains[]`` array form from adcp#4827.
"""
domains = selector.get("publisher_domains")
if isinstance(domains, list):
return [d for d in domains if isinstance(d, str) and d]
domain = selector.get("publisher_domain")
if isinstance(domain, str) and domain:
return [domain]
return []


def _resolve_inline(
selector: dict[str, Any],
domain_index: dict[str, list[dict[str, Any]]],
domain: str,
) -> list[dict[str, Any]] | None:
"""Attempt to satisfy a selector from the parent file's inline properties.

``domain_index`` is a pre-built mapping of publisher_domain → property list
(built once per ``_resolve_agent_properties`` call for O(1) per-domain
lookup instead of O(N) linear scan).

Returns ``None`` when ``domain_index`` has no entry for ``domain`` — the
inline path has no data for this domain; a federated fetch would be next.
Returns ``[]`` when inline candidates exist but none pass the selector
filter — this is a real empty set; do NOT fall back.

Handles ``selection_type`` values: ``"all"``, ``"by_tag"``, ``"by_id"``.
Unknown types are treated permissively (return all domain candidates).
"""
candidates = domain_index.get(domain)
if not candidates:
return None # no inline data for this domain

selection_type = selector.get("selection_type", "all")
if selection_type == "all":
return list(candidates)
if selection_type == "by_tag":
required_tags = {t for t in selector.get("property_tags", []) if isinstance(t, str)}
if not required_tags:
return list(candidates)
return [
p for p in candidates
if required_tags & {t for t in p.get("tags", []) if isinstance(t, str)}
]
if selection_type == "by_id":
required_ids = {i for i in selector.get("property_ids", []) if isinstance(i, str)}
return [p for p in candidates if p.get("property_id") in required_ids]
# Unknown selection_type — permissive fallback
return list(candidates)


def get_all_properties(adagents_data: dict[str, Any]) -> list[dict[str, Any]]:
"""Extract all properties from adagents.json data.

Expand Down Expand Up @@ -1035,8 +1121,8 @@ def get_properties_by_agent(adagents_data: dict[str, Any], agent_url: str) -> li
- inline_properties: Properties defined directly in the agent's properties array
- property_ids: Filter top-level properties by property_id
- property_tags: Filter top-level properties by tags
- publisher_properties: References properties from other publisher domains
(returns the selector objects, not resolved properties)
- publisher_properties: Inline-resolved properties from other publisher
domains (resolved from the parent file's top-level properties[] array)

Args:
adagents_data: Parsed adagents.json data
Expand Down
177 changes: 171 additions & 6 deletions tests/test_adagents.py
Original file line number Diff line number Diff line change
Expand Up @@ -1338,8 +1338,34 @@ def test_get_properties_by_agent_property_tags_multiple(self):
assert properties[1]["name"] == "Site 2"

def test_get_properties_by_agent_publisher_properties(self):
"""Should return publisher_properties selectors for publisher_properties type."""
"""publisher_properties resolves inline properties, not raw selector dicts."""
adagents_data = {
"properties": [
{
"property_id": "ctv-001",
"publisher_domain": "cnn.com",
"name": "CNN CTV",
"tags": ["ctv"],
},
{
"property_id": "ctv-002",
"publisher_domain": "cnn.com",
"name": "CNN Sports CTV",
"tags": ["ctv", "sports"],
},
{
"property_id": "web-001",
"publisher_domain": "cnn.com",
"name": "CNN Web",
"tags": ["web"],
},
{
"property_id": "espn-001",
"publisher_domain": "espn.com",
"name": "ESPN Home",
"tags": ["sports"],
},
],
"authorized_agents": [
{
"url": "https://agent1.example.com",
Expand All @@ -1361,11 +1387,150 @@ def test_get_properties_by_agent_publisher_properties(self):
}

properties = get_properties_by_agent(adagents_data, "https://agent1.example.com")
assert len(properties) == 2
assert properties[0]["publisher_domain"] == "cnn.com"
assert properties[0]["selection_type"] == "by_tag"
assert properties[1]["publisher_domain"] == "espn.com"
assert properties[1]["selection_type"] == "all"
property_ids = {p["property_id"] for p in properties}
# by_tag "ctv": ctv-001 and ctv-002 match; web-001 does not
# all: espn-001 matches
assert property_ids == {"ctv-001", "ctv-002", "espn-001"}
# Must return resolved property dicts, not selector dicts
assert all("property_id" in p for p in properties)
assert not any("selection_type" in p for p in properties)

def test_get_properties_by_agent_publisher_domains_fanout(self):
    """A compact ``publisher_domains`` selector resolves every listed domain inline."""
    site_a = {
        "property_id": "a-001",
        "publisher_domain": "site-a.com",
        "name": "Site A",
        "tags": ["news"],
    }
    site_b = {
        "property_id": "b-001",
        "publisher_domain": "site-b.com",
        "name": "Site B",
        "tags": ["news"],
    }
    # One selector naming both domains at once, selecting everything on each.
    multi_domain_selector = {
        "publisher_domains": ["site-a.com", "site-b.com"],
        "selection_type": "all",
    }
    agent = {
        "url": "https://agent1.example.com",
        "authorization_type": "publisher_properties",
        "authorized_for": "Multi-domain",
        "publisher_properties": [multi_domain_selector],
    }
    adagents_data = {
        "properties": [site_a, site_b],
        "authorized_agents": [agent],
    }

    resolved = get_properties_by_agent(adagents_data, "https://agent1.example.com")

    resolved_ids = {prop["property_id"] for prop in resolved}
    assert resolved_ids == {"a-001", "b-001"}

def test_get_properties_by_agent_publisher_properties_no_inline(self):
    """With no top-level properties for the selected domain the result is an empty list."""
    external_selector = {
        "publisher_domain": "external.com",
        "selection_type": "all",
    }
    agent = {
        "url": "https://agent1.example.com",
        "authorization_type": "publisher_properties",
        "authorized_for": "Cross-domain",
        "publisher_properties": [external_selector],
    }
    # The parent file declares no properties at all, so nothing can match.
    adagents_data = {"properties": [], "authorized_agents": [agent]}

    resolved = get_properties_by_agent(adagents_data, "https://agent1.example.com")

    assert resolved == []

def test_get_properties_by_agent_publisher_properties_by_id(self):
    """A ``by_id`` selector returns only the properties whose id it lists."""
    wanted = {"property_id": "ctv-001", "publisher_domain": "cnn.com", "name": "CNN CTV"}
    # Same domain, but its id is not listed in the selector below.
    unwanted = {"property_id": "ctv-002", "publisher_domain": "cnn.com", "name": "CNN Web"}
    by_id_selector = {
        "publisher_domain": "cnn.com",
        "selection_type": "by_id",
        "property_ids": ["ctv-001"],
    }
    agent = {
        "url": "https://agent1.example.com",
        "authorization_type": "publisher_properties",
        "authorized_for": "Specific properties",
        "publisher_properties": [by_id_selector],
    }
    adagents_data = {
        "properties": [wanted, unwanted],
        "authorized_agents": [agent],
    }

    resolved = get_properties_by_agent(adagents_data, "https://agent1.example.com")

    resolved_ids = {prop["property_id"] for prop in resolved}
    assert resolved_ids == {"ctv-001"}

def test_get_properties_by_agent_cafemedia_scale(self):
    """Large fan-out fixture: 6,843 inline properties across 6,800 child domains.

    Every property carries the ``raptive_managed`` tag and a single agent
    selects them all through one ``publisher_domains`` / ``by_tag`` selector.

    The fixture is sized so that a per-domain linear scan over all properties
    would be noticeably slow. Note that this test makes no timing assertion
    itself; it checks only the resolved result.
    """
    # 6,800 child publisher domains (cafemedia fan-out shape).
    child_domains = [f"site{n:04d}.raptive.com" for n in range(6800)]

    # One base property for every child domain.
    base_properties: list[dict] = [
        {
            "property_id": f"p-{n:05d}",
            "publisher_domain": host,
            "name": f"Site {n} — Raptive Managed",
            "tags": ["raptive_managed"],
        }
        for n, host in enumerate(child_domains)
    ]
    # A second property on each of the first 43 domains (total: 6,843).
    extra_properties: list[dict] = [
        {
            "property_id": f"extra-{n:03d}",
            "publisher_domain": child_domains[n],
            "name": f"Site {n} Extra Property",
            "tags": ["raptive_managed", "ctv"],
        }
        for n in range(43)
    ]
    properties = base_properties + extra_properties

    network_selector = {
        "publisher_domains": child_domains,
        "selection_type": "by_tag",
        "property_tags": ["raptive_managed"],
    }
    agent = {
        "url": "https://interchange.io",
        "authorization_type": "publisher_properties",
        "authorized_for": "Raptive managed network",
        "publisher_properties": [network_selector],
    }
    adagents_data = {"properties": properties, "authorized_agents": [agent]}

    result = get_properties_by_agent(adagents_data, "https://interchange.io")

    assert len(result) == 6843
    # Every one of the 6,800 domains must be represented, not just a subset.
    domains_seen = {prop["publisher_domain"] for prop in result}
    assert domains_seen == set(child_domains)
    assert all("raptive_managed" in prop.get("tags", []) for prop in result)
    # Results must be resolved property dicts, not the selector dicts.
    assert all("property_id" in prop for prop in result)
    assert not any("publisher_domains" in prop for prop in result)

def test_get_properties_by_agent_protocol_agnostic(self):
"""Should match agent URL regardless of protocol."""
Expand Down
Loading