From ea3978889f07bf48e13db8e2a3f82135ff175d04 Mon Sep 17 00:00:00 2001 From: Corey Pyle Date: Mon, 18 Aug 2025 15:09:35 -0400 Subject: [PATCH 1/3] Fix dedupe_reservoir issues. When dedupe_reservoir was run, it was not accounting for title_abbrev fields that were already appended with a number. This change compares the field values without any appended numbers, and then appends a new number based on the count. Initially I wanted to just pick up where the count left off. However, it turned out to be possible to generate duplicate titles on subsequent runs, so there was no way to determine what the 'original' duplicate was. --- aws_doc_sdk_examples_tools/lliam/__init__.py | 0 .../lliam/service_layer/dedupe_reservoir.py | 79 +++++++++--- .../lliam/service_layer/run_ailly.py | 1 + .../lliam/test/dedupe_reservoir_test.py | 122 ++++++++++++++++++ aws_doc_sdk_examples_tools/metadata.py | 2 +- 5 files changed, 186 insertions(+), 18 deletions(-) create mode 100644 aws_doc_sdk_examples_tools/lliam/__init__.py create mode 100644 aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py diff --git a/aws_doc_sdk_examples_tools/lliam/__init__.py b/aws_doc_sdk_examples_tools/lliam/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py index 3c0106b..2885abe 100644 --- a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py +++ b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py @@ -1,7 +1,10 @@ +import logging +import re + from collections import Counter from dataclasses import replace -import logging -from typing import Dict +from pathlib import Path +from typing import Dict, Iterable, List from aws_doc_sdk_examples_tools.doc_gen import DocGen from aws_doc_sdk_examples_tools.lliam.domain.commands import DedupeReservoir @@ -12,33 +15,75 @@ logger = logging.getLogger(__name__) -def make_title_abbreviation(example: Example, counter: Counter): +def make_abbrev(example: Example, counter: Counter) -> str: + if not example.title_abbrev: + return "" + count = counter[example.title_abbrev] abbrev = f"{example.title_abbrev} ({count + 1})" if count else example.title_abbrev counter[example.title_abbrev] += 1 return abbrev -def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None): - doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False)) +def reset_abbrev_count(examples: Dict[str, Example]) -> Dict[str, Example]: + """ + Reset all duplicate title abbreviations back to their un-enumerated state. + + I don't love this. Ideally we would only update new title_abbrev fields + with the incremented count. But there's no way to know which ones are new + or even which particular title_abbrev is the original. - examples: Dict[str, Example] = {} + Ex. + title_abbrev: some policy + title_abbrev: some policy (2) + title_abbrev: some policy + title_abbrev: some policy - for id, example in doc_gen.examples.items(): - if cmd.packages and example.file: - package = example.file.name.split("_metadata.yaml")[0] - if package in cmd.packages: - examples[id] = example - else: - examples[id] = example + Which one is the original? Which ones are new? + """ - title_abbrev_counts: Counter = Counter() + updated_examples = {} for id, example in examples.items(): - examples[id] = replace( + updated_examples[id] = replace( example, - title_abbrev=make_title_abbreviation(example, title_abbrev_counts), + title_abbrev=re.sub(r"(\s\(\d+\))*$", "", example.title_abbrev or ""), ) + return updated_examples + + +def example_in_packages(example: Example, packages: List[str]) -> bool: + if packages and example.file: + example_pkg_name = example.file.name.split("_metadata.yaml")[0] + if not example_pkg_name in packages: + return False + return True + + +def dedupe_examples( + examples: Dict[str, Example], packages: List[str] +) -> Dict[str, Example]: + filtered = { + id: ex for id, ex in examples.items() if example_in_packages(ex, packages) + } + + reset_examples = reset_abbrev_count(filtered) + + counter = Counter() + + return { + id: replace(ex, title_abbrev=make_abbrev(ex, counter)) + for id, ex in reset_examples.items() + } + + +def write_examples(examples: Dict[str, Example], root: Path): writes = prepare_write(examples) - write_many(cmd.root, writes) + write_many(root, writes) + + +def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None): + doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False)) + examples = dedupe_examples(doc_gen.examples, cmd.packages) + write_examples(examples, cmd.root) diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py index bc6a651..fe45a71 100644 --- a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py +++ b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py @@ -22,6 +22,7 @@ "ailly", "--max-depth", "10", + "--no-overwrite", "--root", str(AILLY_DIR_PATH), ] diff --git a/aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py b/aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py new file mode 100644 index 0000000..983c137 --- /dev/null +++ b/aws_doc_sdk_examples_tools/lliam/test/dedupe_reservoir_test.py @@ -0,0 +1,122 @@ +from collections import Counter +from pathlib import Path + +from aws_doc_sdk_examples_tools.metadata import Example +from aws_doc_sdk_examples_tools.lliam.service_layer.dedupe_reservoir import ( + make_abbrev, + example_in_packages, + reset_abbrev_count, + dedupe_examples, +) + + +def test_make_abbrev_continues_numbering(): + """Test that numbering continues from existing numbered titles.""" + counter = Counter({"Some abbrev": 2}) + example = Example(id="test", file=Path(), languages={}, title_abbrev="Some abbrev") + result = make_abbrev(example, counter) + + assert result == "Some abbrev (3)" + + +def test_make_abbrev_first_occurrence(): + """Test that first occurrence doesn't get numbered.""" + counter = Counter() + example = Example(id="test", file=Path(), languages={}, title_abbrev="New abbrev") + result = make_abbrev(example, counter) + + assert result == "New abbrev" + assert counter["New abbrev"] == 1 + + +def test_example_in_packages_no_packages(): + """Test that example is included when no packages specified.""" + example = Example(id="test", file=Path("test_metadata.yaml"), languages={}) + result = example_in_packages(example, []) + + assert result is True + + +def test_example_in_packages_matching_package(): + """Test that example is included when package matches.""" + example = Example(id="test", file=Path("pkg1_metadata.yaml"), languages={}) + result = example_in_packages(example, ["pkg1", "pkg2"]) + + assert result is True + + +def test_example_in_packages_non_matching_package(): + """Test that example is excluded when package doesn't match.""" + example = Example(id="test", file=Path("pkg3_metadata.yaml"), languages={}) + result = example_in_packages(example, ["pkg1", "pkg2"]) + + assert result is False + + +def test_build_abbrev_counter(): + """Test building counter from examples with existing numbered titles.""" + examples = { + "1": Example(id="1", file=Path(), languages={}, title_abbrev="Test (1)"), + "2": Example(id="2", file=Path(), languages={}, title_abbrev="Test (2)"), + "3": Example(id="3", file=Path(), languages={}, title_abbrev="Other"), + "4": Example(id="4", file=Path(), languages={}, title_abbrev="Test"), + } + + result = reset_abbrev_count(examples) + + assert result["1"].title_abbrev == "Test" + assert result["2"].title_abbrev == "Test" + assert result["3"].title_abbrev == "Other" + assert result["4"].title_abbrev == "Test" + + +def test_build_abbrev_counter_empty(): + """Test building counter from empty examples list.""" + result = reset_abbrev_count({}) + + assert len(result) == 0 + + +def test_dedupe_examples(): + """Test deduping examples with existing numbered titles.""" + examples = { + "ex1": Example( + id="ex1", + file=Path("pkg1_metadata.yaml"), + languages={}, + title_abbrev="Test (2) (2)", + ), + "ex2": Example( + id="ex2", + file=Path("pkg1_metadata.yaml"), + languages={}, + title_abbrev="Test (3) (3) (3)", + ), + "ex3": Example( + id="ex3", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test" + ), + "ex4": Example( + id="ex4", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test" + ), + "ex5": Example( + id="ex5", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test" + ), + "ex6": Example( + id="ex6", file=Path("pkg2_metadata.yaml"), languages={}, title_abbrev="Test" + ), + } + + result = dedupe_examples(examples, []) + + assert len(result) == 6 + title_abbrevs = sorted( + [ex.title_abbrev for ex in result.values() if ex.title_abbrev] + ) + assert title_abbrevs == [ + "Test", + "Test (2)", + "Test (3)", + "Test (4)", + "Test (5)", + "Test (6)", + ] diff --git a/aws_doc_sdk_examples_tools/metadata.py b/aws_doc_sdk_examples_tools/metadata.py index 92a0dd9..84cb4be 100755 --- a/aws_doc_sdk_examples_tools/metadata.py +++ b/aws_doc_sdk_examples_tools/metadata.py @@ -139,7 +139,7 @@ def validate(self, errors: MetadataErrors, root: Path): @dataclass class Example: id: str - file: Optional[Path] + file: Path languages: Dict[str, Language] # Human readable title. title: Optional[str] = field(default="") From 989416e0ccbb074bed324a48dea8fbabb670831c Mon Sep 17 00:00:00 2001 From: Corey Pyle Date: Mon, 18 Aug 2025 15:38:04 -0400 Subject: [PATCH 2/3] Fix mypy warnings. --- aws_doc_sdk_examples_tools/agent/update_doc_gen.py | 2 +- .../lliam/service_layer/dedupe_reservoir.py | 2 +- .../lliam/service_layer/update_doc_gen.py | 3 +-- aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py | 2 +- aws_doc_sdk_examples_tools/yaml_mapper.py | 3 ++- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/aws_doc_sdk_examples_tools/agent/update_doc_gen.py b/aws_doc_sdk_examples_tools/agent/update_doc_gen.py index 50b445a..0a78ab4 100644 --- a/aws_doc_sdk_examples_tools/agent/update_doc_gen.py +++ b/aws_doc_sdk_examples_tools/agent/update_doc_gen.py @@ -26,7 +26,7 @@ def examples_from_updates(updates_path: Path) -> Iterable[Example]: examples = [ Example( id=id, - file=None, + file=Path(), languages={}, title=update.get("title"), title_abbrev=update.get("title_abbrev"), diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py index 2885abe..a7a3107 100644 --- a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py +++ b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py @@ -70,7 +70,7 @@ def dedupe_examples( reset_examples = reset_abbrev_count(filtered) - counter = Counter() + counter: Counter = Counter() return { id: replace(ex, title_abbrev=make_abbrev(ex, counter)) diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py b/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py index c8703cf..442f8d8 100644 --- a/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py +++ b/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py @@ -1,7 +1,6 @@ from dataclasses import replace import json import logging -from collections import Counter from pathlib import Path from typing import Dict, Iterable, List @@ -37,7 +36,7 @@ def examples_from_updates(updates: Updates) -> Iterable[Example]: examples = [ Example( id=id, - file=None, + file=Path(), languages={}, title=update.get("title"), title_abbrev=update.get("title_abbrev"), diff --git a/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py b/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py index 9d28392..4f5ca82 100644 --- a/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py +++ b/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py @@ -32,7 +32,7 @@ def test_update_examples_title_abbrev(doc_gen_tributary: DocGen): # Create an example with a title_abbrev to update update_example = Example( id="iam_policies_example", - file=None, + file=Path(), languages={}, title_abbrev="Updated Title Abbrev", ) diff --git a/aws_doc_sdk_examples_tools/yaml_mapper.py b/aws_doc_sdk_examples_tools/yaml_mapper.py index 2470c91..d2bbef3 100644 --- a/aws_doc_sdk_examples_tools/yaml_mapper.py +++ b/aws_doc_sdk_examples_tools/yaml_mapper.py @@ -1,6 +1,7 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +from pathlib import Path from typing import Dict, Set, Tuple, Any, List, Optional, Union from .metadata import ( Example, @@ -112,7 +113,7 @@ def example_from_yaml( return ( Example( id="", - file=None, + file=Path(), title=title, title_abbrev=title_abbrev, category=category, From d0ccb58e543ee58894719431bb99f8aee98c4ff5 Mon Sep 17 00:00:00 2001 From: Corey Pyle Date: Mon, 18 Aug 2025 15:40:42 -0400 Subject: [PATCH 3/3] Refactor package name parsing for readability. --- .../lliam/service_layer/dedupe_reservoir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py index a7a3107..8a56604 100644 --- a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py +++ b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py @@ -55,7 +55,7 @@ def reset_abbrev_count(examples: Dict[str, Example]) -> Dict[str, Example]: def example_in_packages(example: Example, packages: List[str]) -> bool: if packages and example.file: - example_pkg_name = example.file.name.split("_metadata.yaml")[0] + (example_pkg_name, *_) = example.file.name.split("_metadata.yaml") if not example_pkg_name in packages: return False return True