From 7f84eb27a1a4758e1ca53e8937ccdeb3ab0a83de Mon Sep 17 00:00:00 2001
From: Corey Pyle
Date: Wed, 9 Jul 2025 17:08:51 -0400
Subject: [PATCH 1/2] Add packages option to run-ailly

Add a --packages option to run-ailly. When it is given, only the prompt
files in each batch whose names match *<package>*.md are passed to ailly,
and only the requested packages are written to the updates JSON.

Also changes the generated title abbreviation suffix from 'docs' to
'guide' and applies formatting-only cleanups.
---
 .../lliam/domain/commands.py                  |  1 +
 .../lliam/entry_points/lliam_app.py           | 22 ++++----
 .../lliam/service_layer/messagebus.py         |  8 +--
 .../lliam/service_layer/run_ailly.py          | 54 ++++++++++++++-----
 .../lliam/service_layer/update_doc_gen.py     |  2 +-
 .../lliam/test/update_doc_gen_test.py         |  2 +-
 6 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/aws_doc_sdk_examples_tools/lliam/domain/commands.py b/aws_doc_sdk_examples_tools/lliam/domain/commands.py
index d9c89cf..2f52d10 100644
--- a/aws_doc_sdk_examples_tools/lliam/domain/commands.py
+++ b/aws_doc_sdk_examples_tools/lliam/domain/commands.py
@@ -17,6 +17,7 @@ class CreatePrompts(Command):
 @dataclass
 class RunAilly(Command):
     batches: List[str]
+    packages: List[str]
 
 
 @dataclass
diff --git a/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py b/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
index 4dccb76..560b268 100644
--- a/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
+++ b/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
@@ -5,10 +5,7 @@
 import logging
 import typer
 
-from aws_doc_sdk_examples_tools.lliam.config import (
-    AILLY_DIR,
-    BATCH_PREFIX
-)
+from aws_doc_sdk_examples_tools.lliam.config import AILLY_DIR, BATCH_PREFIX
 from aws_doc_sdk_examples_tools.lliam.domain import commands
 from aws_doc_sdk_examples_tools.lliam.service_layer import messagebus, unit_of_work
 
@@ -38,18 +35,21 @@ def create_prompts(iam_tributary_root: str, system_prompts: List[str] = []):
 def run_ailly(
     batches: Annotated[
         Optional[str],
-        typer.Option(
-            help="Batch names to process (comma-separated list)"
-        ),
+        typer.Option(help="Batch names to process (comma-separated list)"),
+    ] = None,
+    packages: Annotated[
+        Optional[str], typer.Option(help="Comma delimited list of packages to update")
     ] = None,
 ) -> None:
     """
     Run ailly to generate IAM policy content and process the results.
     If batches is specified, only those batches will be processed.
     If batches is omitted, all batches will be processed.
+    If packages is specified, only those packages will be processed.
     """
     requested_batches = parse_batch_names(batches)
-    cmd = commands.RunAilly(batches=requested_batches)
+    package_names = parse_package_names(packages)
+    cmd = commands.RunAilly(batches=requested_batches, packages=package_names)
     messagebus.handle(cmd)
 
 
@@ -58,9 +58,7 @@ def update_reservoir(
     iam_tributary_root: str,
     batches: Annotated[
         Optional[str],
-        typer.Option(
-            help="Batch names to process (comma-separated list)"
-        ),
+        typer.Option(help="Batch names to process (comma-separated list)"),
     ] = None,
     packages: Annotated[
         Optional[str], typer.Option(help="Comma delimited list of packages to update")
@@ -86,7 +84,7 @@ def parse_batch_names(batch_names_str: Optional[str]) -> List[str]:
     """
     if not batch_names_str:
         return []
-    
+
     batch_names = []
 
     for name in batch_names_str.split(","):
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py b/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
index 4e0b7eb..8b9bf14 100644
--- a/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
+++ b/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
@@ -12,9 +12,7 @@
 Message = commands.Command
 
 
-def handle(
-    message: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork] = None
-):
+def handle(message: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork] = None):
     queue = [message]
 
     while queue:
@@ -25,9 +23,7 @@ def handle(
             raise Exception(f"{message} was not a Command")
 
 
-def handle_command(
-    command: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork]
-):
+def handle_command(command: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork]):
     handler = COMMAND_HANDLERS[type(command)]
     handler(command, uow)
 
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
index 4004b2f..4306131 100644
--- a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
+++ b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import sys
 import time
 from collections import defaultdict
 from datetime import timedelta
@@ -13,6 +14,14 @@
     BATCH_PREFIX,
 )
 
+AILLY_CMD_BASE = [
+    "ailly",
+    "--max-depth",
+    "10",
+    "--root",
+    str(AILLY_DIR_PATH),
+]
+
 logger = logging.getLogger(__file__)
 
 
@@ -23,7 +32,7 @@ def handle_run_ailly(cmd: RunAilly, uow: None):
         total_start_time = time.time()
 
         for batch in resolved_batches:
-            run_ailly_single_batch(batch)
+            run_ailly_single_batch(batch, cmd.packages)
 
         total_end_time = time.time()
         total_duration = total_end_time - total_start_time
@@ -56,19 +65,27 @@ def resolve_requested_batches(batch_names: List[str]) -> List[Path]:
     return batch_paths
 
 
-def run_ailly_single_batch(batch: Path) -> None:
+def run_ailly_single_batch(batch: Path, packages: List[str] = []) -> None:
     """Run ailly and process files for a single batch."""
     batch_start_time = time.time()
     iam_updates_path = AILLY_DIR_PATH / f"updates_{batch.name}.json"
 
-    cmd = [
-        "ailly",
-        "--max-depth",
-        "10",
-        "--root",
-        str(AILLY_DIR_PATH),
-        batch.name,
-    ]
+    if packages:
+        paths = []
+        for package in packages:
+            package_files = [
+                f"{batch.name}/{p.name}" for p in batch.glob(f"*{package}*.md")
+            ]
+            paths.extend(package_files)
+
+        if not paths:
+            logger.error(f"No matching files found for packages: {packages}")
+            sys.exit(1)
+
+        cmd = AILLY_CMD_BASE + paths
+    else:
+        cmd = AILLY_CMD_BASE + [batch.name]
+
     logger.info(f"Running {cmd}")
     run(cmd)
 
@@ -79,7 +96,9 @@ def run_ailly_single_batch(batch: Path) -> None:
     )
 
     logger.info(f"Processing generated content for {batch.name}")
-    process_ailly_files(input_dir=batch, output_file=iam_updates_path)
+    process_ailly_files(
+        input_dir=batch, output_file=iam_updates_path, packages=packages
+    )
 
 
 EXPECTED_KEYS: Set[str] = set(["title", "title_abbrev"])
@@ -177,7 +196,10 @@ def parse_package_name(policy_update: Dict[str, str]) -> Optional[str]:
 
 
 def process_ailly_files(
-    input_dir: Path, output_file: Path, file_pattern: str = "*.md.ailly.md"
+    input_dir: Path,
+    output_file: Path,
+    file_pattern: str = "*.md.ailly.md",
+    packages: List[str] = [],
 ) -> None:
     """
     Process all .md.ailly.md files in the input directory and write the results as JSON to the output file.
@@ -186,6 +208,7 @@ def process_ailly_files(
         input_dir: Directory containing .md.ailly.md files
         output_file: Path to the output JSON file
         file_pattern: Pattern to match files (default: "*.md.ailly.md")
+        packages: Optional list of packages to filter by
     """
     results = defaultdict(list)
 
@@ -197,6 +220,13 @@ def process_ailly_files(
                 package_name = parse_package_name(policy_update)
                 if not package_name:
                     raise TypeError(f"Could not get package name from policy update.")
+
+                if packages and package_name not in packages:
+                    logger.info(
+                        f"Skipping package {package_name} (not in requested packages)"
+                    )
+                    continue
+
                 results[package_name].append(policy_update)
 
         with open(output_file, "w", encoding="utf-8") as out_file:
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py b/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py
index 1380b73..6fa3378 100644
--- a/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py
+++ b/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py
@@ -48,7 +48,7 @@ def make_title_abbreviation(old: Example, new: Example, abbreviations: Counter):
     version = language.versions[0]
     source = version.source
     source_title = source.title if source else ""
-    base = f"{new.title_abbrev} (from '{source_title}' docs)"
+    base = f"{new.title_abbrev} (from '{source_title}' guide)"
     abbreviations[base] += 1
     count = abbreviations[base]
     return f"{base} ({count})" if count > 1 else base
diff --git a/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py b/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py
index a8e2b08..9d28392 100644
--- a/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py
+++ b/aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py
@@ -44,5 +44,5 @@ def test_update_examples_title_abbrev(doc_gen_tributary: DocGen):
     updated_example = doc_gen_tributary.examples["iam_policies_example"]
     assert (
         updated_example.title_abbrev
-        == "Updated Title Abbrev (from 'AWS Account Management' docs)"
+        == "Updated Title Abbrev (from 'AWS Account Management' guide)"
     )

From d742071e837e22afb1a67eaf499d58eb70bdc4d1 Mon Sep 17 00:00:00 2001
From: Corey Pyle
Date: Mon, 14 Jul 2025 11:21:34 -0400
Subject: [PATCH 2/2] Add better error handling to Lliam

handle_run_ailly now collects CommandExecutionError values and returns
them; the message bus passes the handler's return value back to the CLI
entry points, which log each error and exit with status 1.
run_ailly_single_batch raises FileNotFoundError instead of calling
sys.exit() when no files match the requested packages.

Also adds process_ailly_files.py, a script that pairs .md prompts with
their .ailly.md responses and writes them out as JSONL.
---
 .../lliam/domain/errors.py                    |  12 +
 .../lliam/entry_points/lliam_app.py           |  18 +-
 .../lliam/service_layer/messagebus.py         |   5 +-
 .../lliam/service_layer/run_ailly.py          |  22 +-
 process_ailly_files.py                        | 249 ++++++++++++++++++
 5 files changed, 296 insertions(+), 10 deletions(-)
 create mode 100644 aws_doc_sdk_examples_tools/lliam/domain/errors.py
 create mode 100644 process_ailly_files.py

diff --git a/aws_doc_sdk_examples_tools/lliam/domain/errors.py b/aws_doc_sdk_examples_tools/lliam/domain/errors.py
new file mode 100644
index 0000000..54a0726
--- /dev/null
+++ b/aws_doc_sdk_examples_tools/lliam/domain/errors.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class DomainError:
+    pass
+
+
+@dataclass
+class CommandExecutionError(DomainError):
+    command_name: str
+    message: str
diff --git a/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py b/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
index 560b268..ac67686 100644
--- a/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
+++ b/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
@@ -6,7 +6,7 @@
 import typer
 
 from aws_doc_sdk_examples_tools.lliam.config import AILLY_DIR, BATCH_PREFIX
-from aws_doc_sdk_examples_tools.lliam.domain import commands
+from aws_doc_sdk_examples_tools.lliam.domain import commands, errors
 from aws_doc_sdk_examples_tools.lliam.service_layer import messagebus, unit_of_work
 
 logging.basicConfig(
@@ -28,7 +28,8 @@ def create_prompts(iam_tributary_root: str, system_prompts: List[str] = []):
         out_dir=AILLY_DIR,
     )
     uow = unit_of_work.FsUnitOfWork()
-    messagebus.handle(cmd, uow)
+    errors = messagebus.handle(cmd, uow)
+    handle_domain_errors(errors)
 
 
 @app.command()
@@ -50,7 +51,8 @@ def run_ailly(
     requested_batches = parse_batch_names(batches)
     package_names = parse_package_names(packages)
     cmd = commands.RunAilly(batches=requested_batches, packages=package_names)
-    messagebus.handle(cmd)
+    errors = messagebus.handle(cmd)
+    handle_domain_errors(errors)
 
 
 @app.command()
@@ -75,7 +77,15 @@ def update_reservoir(
     cmd = commands.UpdateReservoir(
         root=doc_gen_root, batches=batch_names, packages=package_names
     )
-    messagebus.handle(cmd)
+    errors = messagebus.handle(cmd)
+    handle_domain_errors(errors)
+
+
+def handle_domain_errors(errors: List[errors.DomainError]):
+    if errors:
+        for error in errors:
+            logger.error(error)
+        raise typer.Exit(code=1)
 
 
 def parse_batch_names(batch_names_str: Optional[str]) -> List[str]:
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py b/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
index 8b9bf14..2579ad9 100644
--- a/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
+++ b/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
@@ -18,14 +18,15 @@ def handle(message: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork] =
     while queue:
         message = queue.pop(0)
         if isinstance(message, commands.Command):
-            handle_command(message, uow)
+            return handle_command(message, uow)
         else:
             raise Exception(f"{message} was not a Command")
 
 
 def handle_command(command: commands.Command, uow: Optional[unit_of_work.FsUnitOfWork]):
     handler = COMMAND_HANDLERS[type(command)]
-    handler(command, uow)
+    errors = handler(command, uow)
+    return errors
 
 
 COMMAND_HANDLERS: Dict[Type[commands.Command], Callable] = {
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
index 4306131..8d9ef25 100644
--- a/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
+++ b/aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py
@@ -1,14 +1,18 @@
 import json
 import logging
-import sys
 import time
 from collections import defaultdict
+from dataclasses import dataclass
 from datetime import timedelta
 from pathlib import Path
 from subprocess import run
 from typing import Any, Dict, List, Optional, Set
 
 from aws_doc_sdk_examples_tools.lliam.domain.commands import RunAilly
+from aws_doc_sdk_examples_tools.lliam.domain.errors import (
+    CommandExecutionError,
+    DomainError,
+)
 from aws_doc_sdk_examples_tools.lliam.config import (
     AILLY_DIR_PATH,
     BATCH_PREFIX,
@@ -28,11 +32,20 @@
 def handle_run_ailly(cmd: RunAilly, uow: None):
     resolved_batches = resolve_requested_batches(cmd.batches)
 
+    errors: List[DomainError] = []
+
     if resolved_batches:
         total_start_time = time.time()
 
         for batch in resolved_batches:
-            run_ailly_single_batch(batch, cmd.packages)
+            try:
+                run_ailly_single_batch(batch, cmd.packages)
+            except FileNotFoundError as e:
+                errors.append(
+                    CommandExecutionError(
+                        command_name=cmd.__class__.__name__, message=str(e)
+                    )
+                )
 
         total_end_time = time.time()
         total_duration = total_end_time - total_start_time
@@ -41,6 +54,8 @@ def handle_run_ailly(cmd: RunAilly, uow: None):
             f"[TIMECHECK] {num_batches} batches took {format_duration(total_duration)} to run"
         )
 
+    return errors
+
 
 def resolve_requested_batches(batch_names: List[str]) -> List[Path]:
     if not batch_names:
@@ -79,8 +94,7 @@ def run_ailly_single_batch(batch: Path, packages: List[str] = []) -> None:
             paths.extend(package_files)
 
         if not paths:
-            logger.error(f"No matching files found for packages: {packages}")
-            sys.exit(1)
+            raise FileNotFoundError(f"No matching files found for packages: {packages}")
 
         cmd = AILLY_CMD_BASE + paths
     else:
diff --git a/process_ailly_files.py b/process_ailly_files.py
new file mode 100644
index 0000000..25ed750
--- /dev/null
+++ b/process_ailly_files.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""
+Script to process .md and .ailly.md file pairs and generate JSONL output.
+
+This script:
+1. Collects file data from .md and .ailly.md pairs
+2. Extracts content without front matter
+3. Generates JSONL output with prompt and model responses
+"""
+
+import os
+import json
+import re
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+
+def collect_file_pairs(
+    directory: str, limit: Optional[int] = None
+) -> List[Tuple[str, str]]:
+    """
+    Collect pairs of .md and .ailly.md files from the specified directory.
+
+    Args:
+        directory: Path to the directory containing the files
+        limit: Optional limit on the number of pairs to process
+
+    Returns:
+        List of tuples containing (md_file_path, ailly_md_file_path)
+    """
+    md_files = {}
+    ailly_md_files = {}
+
+    # Walk through the directory and collect all .md and .ailly.md files
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".md") and not file.endswith(".ailly.md"):
+                md_files[file] = file_path
+            elif file.endswith(".ailly.md"):
+                base_name = file[:-9]  # Remove '.ailly.md'
+                ailly_md_files[base_name] = file_path
+
+    # Match the pairs
+    pairs = []
+    for base_name, md_path in md_files.items():
+        if base_name in ailly_md_files:
+            pairs.append((md_path, ailly_md_files[base_name]))
+
+    # Apply limit if specified
+    if limit is not None and limit > 0:
+        pairs = pairs[:limit]
+
+    return pairs
+
+
+def extract_content(file_path: str) -> str:
+    """
+    Extract content from a file, removing any front matter.
+
+    Args:
+        file_path: Path to the file
+
+    Returns:
+        Content of the file without front matter
+    """
+    with open(file_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Remove front matter if it exists (content between --- markers)
+    front_matter_pattern = r"^---\n.*?\n---\n"
+    content = re.sub(front_matter_pattern, "", content, flags=re.DOTALL)
+
+    return content.strip()
+
+
+def get_aillyrc_content(directory: str) -> str:
+    """
+    Get the content of the .aillyrc file without the front matter.
+
+    Args:
+        directory: Directory containing the .aillyrc file
+
+    Returns:
+        Content of the .aillyrc file without front matter
+    """
+    # Find the .aillyrc file by going up directories if needed
+    current_dir = directory
+    aillyrc_path = None
+
+    while current_dir and current_dir != os.path.dirname(current_dir):
+        potential_path = os.path.join(current_dir, ".aillyrc")
+        if os.path.exists(potential_path):
+            aillyrc_path = potential_path
+            break
+        current_dir = os.path.dirname(current_dir)
+
+    if not aillyrc_path:
+        raise FileNotFoundError("Could not find .aillyrc file")
+
+    return extract_content(aillyrc_path)
+
+
+def extract_model_identifier(ailly_file_path: str) -> Dict:
+    """
+    Extract model identifier from the .ailly.md file's front matter.
+
+    Args:
+        ailly_file_path: Path to the .ailly.md file
+
+    Returns:
+        Dictionary containing model identifier information
+    """
+    with open(ailly_file_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Extract front matter
+    front_matter_match = re.search(r"^---\n(.*?)\n---\n", content, re.DOTALL)
+    if not front_matter_match:
+        return {}
+
+    front_matter = front_matter_match.group(1)
+
+    # Extract debug information
+    debug_match = re.search(r"debug:\s*\n(.*?)(\n\w|$)", front_matter, re.DOTALL)
+    if not debug_match:
+        return {}
+
+    debug_content = debug_match.group(1)
+
+    # Extract model information
+    model_match = re.search(r"model:\s*(.*?)$", debug_content, re.MULTILINE)
+    region_match = re.search(r"region:\s*(.*?)$", debug_content, re.MULTILINE)
+
+    model_identifier = {}
+    if model_match:
+        model_identifier["model"] = model_match.group(1).strip()
+    if region_match:
+        model_identifier["region"] = region_match.group(1).strip()
+
+    return model_identifier
+
+
+def convert_to_jsonl_format(
+    file_pairs: List[Tuple[str, str]], aillyrc_content: str
+) -> List[Dict]:
+    """
+    Convert file pairs to JSONL format.
+
+    Args:
+        file_pairs: List of (md_file_path, ailly_md_file_path) tuples
+        aillyrc_content: Content of the .aillyrc file
+
+    Returns:
+        List of dictionaries in the required format
+    """
+    jsonl_entries = []
+
+    for md_path, ailly_md_path in file_pairs:
+        # Extract content from files
+        md_content = extract_content(md_path)
+        ailly_md_content = extract_content(ailly_md_path)
+
+        # Extract model identifier
+        model_identifier = extract_model_identifier(ailly_md_path)
+
+        # Create JSONL entry
+        entry = {
+            "prompt": aillyrc_content + "\n\n" + md_content,
+            "modelResponses": [
+                {"response": ailly_md_content, "modelIdentifier": model_identifier}
+            ],
+        }
+
+        jsonl_entries.append(entry)
+
+    return jsonl_entries
+
+
+def write_jsonl_file(entries: List[Dict], output_path: str) -> None:
+    """
+    Write entries to a JSONL file.
+
+    Args:
+        entries: List of dictionaries to write
+        output_path: Path to the output file
+    """
+    with open(output_path, "w", encoding="utf-8") as f:
+        for entry in entries:
+            f.write(json.dumps(entry) + "\n")
+
+
+def main():
+    """Main function to process files and generate JSONL output."""
+    parser = argparse.ArgumentParser(
+        description="Process .md and .ailly.md file pairs and generate JSONL output."
+    )
+    parser.add_argument(
+        "--directory",
+        "-d",
+        type=str,
+        default=".ailly_iam_policy/batch_01",
+        help="Directory containing the file pairs",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="output.jsonl",
+        help="Path to the output JSONL file",
+    )
+    parser.add_argument(
+        "--limit",
+        "-n",
+        type=int,
+        default=None,
+        help="Limit the number of file pairs to process",
+    )
+
+    args = parser.parse_args()
+
+    # Resolve paths
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    directory = os.path.join(base_dir, args.directory)
+    output_path = os.path.join(base_dir, args.output)
+
+    # Step 1: Collect file pairs
+    print(f"Collecting file pairs from {directory}...")
+    file_pairs = collect_file_pairs(directory, args.limit)
+    print(f"Found {len(file_pairs)} file pairs.")
+
+    # Step 2: Get .aillyrc content
+    print("Reading .aillyrc content...")
+    aillyrc_content = get_aillyrc_content(directory)
+
+    # Step 3: Convert to JSONL format
+    print("Converting to JSONL format...")
+    jsonl_entries = convert_to_jsonl_format(file_pairs, aillyrc_content)
+
+    # Step 4: Write to output file
+    print(f"Writing {len(jsonl_entries)} entries to {output_path}...")
+    write_jsonl_file(jsonl_entries, output_path)
+
+    print("Done!")
+
+
+if __name__ == "__main__":
+    main()