From 1378f29764931239f5137e68000df15fd6f4c827 Mon Sep 17 00:00:00 2001 From: Vichy Meas Date: Thu, 28 May 2026 11:52:36 -0700 Subject: [PATCH 1/5] feat: add API Gateway cleanup phase to sweeper function Delete orphaned REST APIs matching sam-integ- pattern before stack deletion to avoid rate limit issues. Uses 35s sleep between API deletions per API Gateway control plane limits. Also unifies pattern matching to a single TEST_PATTERN and reduces cutoff to 6 hours. --- integration/setup/companion-stack.yaml | 113 ++++++++++++++----------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/integration/setup/companion-stack.yaml b/integration/setup/companion-stack.yaml index 5d4a39884..476437a1f 100644 --- a/integration/setup/companion-stack.yaml +++ b/integration/setup/companion-stack.yaml @@ -299,6 +299,10 @@ Resources: Resource: - !Sub "arn:${AWS::Partition}:s3:::*sam-integ-stack-*" - !Sub "arn:${AWS::Partition}:s3:::*sam-integ-stack-*/*" + - Effect: Allow + Action: + - apigateway:DELETE + Resource: !Sub "arn:${AWS::Partition}:apigateway:${AWS::Region}::/restapis/*" - Effect: Allow Action: - logs:DeleteLogGroup @@ -324,8 +328,7 @@ Resources: import boto3, time from datetime import datetime, timezone, timedelta - STACK_PATTERN = 'sam-integ-stack-' - IAM_PATTERN = 'sam-integ-' + TEST_PATTERN = 'sam-integ-' ELIGIBLE_STATUSES = [ 'CREATE_COMPLETE', 'ROLLBACK_COMPLETE', 'ROLLBACK_FAILED', 'REVIEW_IN_PROGRESS', 'DELETE_FAILED', 'UPDATE_FAILED', @@ -335,76 +338,88 @@ Resources: def _has_time(ctx): return ctx.get_remaining_time_in_millis() > 30000 - def _is_test(name, strict=False): - pattern = STACK_PATTERN if strict else IAM_PATTERN - return pattern in name and 'companion' not in name + def _is_test(name): + return TEST_PATTERN in name and 'companion' not in name def handler(event, ctx): cfn = boto3.client('cloudformation') iam = boto3.client('iam') logs = boto3.client('logs') - cutoff = datetime.now(timezone.utc) - timedelta(hours=24) - + apig = boto3.client('apigateway') + cutoff = datetime.now(timezone.utc) - timedelta(hours=6) + _sweep_apis(apig, cutoff, ctx) _sweep_stacks(cfn, iam, cutoff, ctx) - _sweep_log_groups(logs, cutoff, ctx) + _sweep_logs(logs, cutoff, ctx) + + def _sweep_apis(apig, cutoff, ctx): + for page in apig.get_paginator('get_rest_apis').paginate(): + for api in page['items']: + if not _has_time(ctx): + return + if not _is_test(api.get('name', '')): + continue + c = api.get('createdDate') + if c and c.replace(tzinfo=timezone.utc) >= cutoff: + continue + try: + print(f"Deleting API: {api['name']}") + apig.delete_rest_api(restApiId=api['id']) + time.sleep(35) + except Exception: pass def _sweep_stacks(cfn, iam, cutoff, ctx): - deleted = [] for page in cfn.get_paginator('list_stacks').paginate(StackStatusFilter=ELIGIBLE_STATUSES): - for stack in page['StackSummaries']: + for s in page['StackSummaries']: if not _has_time(ctx): - print(f"Attempt to delete ({len(deleted)}) stacks: {deleted}") return - name = stack['StackName'] - if not _is_test(name, strict=True): + name = s['StackName'] + if not _is_test(name): continue - if stack['CreationTime'].replace(tzinfo=timezone.utc) >= cutoff: + if s['CreationTime'].replace(tzinfo=timezone.utc) >= cutoff: continue - if stack['StackStatus'] == 'DELETE_FAILED': + if s['StackStatus'] == 'DELETE_FAILED': _fix_and_retry(cfn, iam, name) try: + print(f"Deleting: {name}") cfn.delete_stack(StackName=name) - deleted.append(name) time.sleep(1) - except Exception as e: - print(f"delete_stack {name}: {e}") - print(f"Attempt to delete ({len(deleted)}) stacks: {deleted}") + except Exception: pass - def _fix_and_retry(cfn, iam, stack_name): + def _fix_and_retry(cfn, iam, name): try: - for event in cfn.describe_stack_events(StackName=stack_name)['StackEvents']: - if event.get('ResourceStatus') != 'DELETE_FAILED': + for ev in cfn.describe_stack_events(StackName=name)['StackEvents']: + if ev.get('ResourceStatus') != 'DELETE_FAILED': continue - resource_type = event.get('ResourceType', '') - resource_id = event.get('PhysicalResourceId', '') - if not resource_id or not _is_test(resource_id): + rt = ev.get('ResourceType', '') + rid = ev.get('PhysicalResourceId', '') + if not rid or not _is_test(rid): continue - if resource_type == 'AWS::IAM::Role': - _force_delete_role(iam, resource_id) - elif resource_type == 'AWS::IAM::Policy': - _force_delete_policy(iam, resource_id) - elif resource_type == 'AWS::S3::Bucket': + if rt == 'AWS::IAM::Role': + _force_delete_role(iam, rid) + elif rt == 'AWS::IAM::Policy': + _force_delete_policy(iam, rid) + elif rt == 'AWS::S3::Bucket': try: - bucket = boto3.resource('s3').Bucket(resource_id) - bucket.object_versions.delete() - bucket.objects.delete() + b = boto3.resource('s3').Bucket(rid) + b.object_versions.delete() + b.objects.delete() except Exception: pass except Exception as e: - print(f"fix_and_retry {stack_name}: {e}") + print(f"fix {name}: {e}") - def _force_delete_role(iam, role_name): + def _force_delete_role(iam, role): try: - for p in iam.list_role_policies(RoleName=role_name)['PolicyNames']: - iam.delete_role_policy(RoleName=role_name, PolicyName=p) - for p in iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']: - iam.detach_role_policy(RoleName=role_name, PolicyArn=p['PolicyArn']) - iam.delete_role(RoleName=role_name) + for p in iam.list_role_policies(RoleName=role)['PolicyNames']: + iam.delete_role_policy(RoleName=role, PolicyName=p) + for p in iam.list_attached_role_policies(RoleName=role)['AttachedPolicies']: + iam.detach_role_policy(RoleName=role, PolicyArn=p['PolicyArn']) + iam.delete_role(RoleName=role) except Exception: pass def _force_delete_policy(iam, arn): try: - for page in iam.get_paginator('list_entities_for_policy').paginate(PolicyArn=arn, EntityFilter='Role'): - for r in page['PolicyRoles']: + for pg in iam.get_paginator('list_entities_for_policy').paginate(PolicyArn=arn, EntityFilter='Role'): + for r in pg['PolicyRoles']: iam.detach_role_policy(RoleName=r['RoleName'], PolicyArn=arn) for v in iam.list_policy_versions(PolicyArn=arn)['Versions']: if not v['IsDefaultVersion']: @@ -412,24 +427,22 @@ Resources: iam.delete_policy(PolicyArn=arn) except Exception: pass - def _sweep_log_groups(logs, cutoff, ctx): + def _sweep_logs(logs, cutoff, ctx): cutoff_ms = int(cutoff.timestamp() * 1000) - deleted = 0 for page in logs.get_paginator('describe_log_groups').paginate(): - for log_group in page['logGroups']: + for lg in page['logGroups']: if not _has_time(ctx): return - name = log_group['logGroupName'] - if STACK_PATTERN not in name: + name = lg['logGroupName'] + if not _is_test(name): continue - if log_group.get('creationTime', 0) >= cutoff_ms: + if lg.get('creationTime', 0) >= cutoff_ms: continue try: + print(f"Deleting: {name}") logs.delete_log_group(logGroupName=name) - deleted += 1 time.sleep(1) except Exception: pass - print(f"Log groups: {deleted} deleted") TestStackSweeperSchedule: Type: AWS::Events::Rule From 1c15baefdeb65acc1b38bcbb1b68ae0d97161123 Mon Sep 17 00:00:00 2001 From: Vichy Meas Date: Thu, 28 May 2026 12:06:14 -0700 Subject: [PATCH 2/5] feat: add 8-minute time cap for API Gateway cleanup phase Prevents the API deletion loop (35s sleep per API) from consuming the entire Lambda budget, ensuring stacks and log groups still get cleaned up in the remaining 7 minutes. --- integration/setup/companion-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/setup/companion-stack.yaml b/integration/setup/companion-stack.yaml index 476437a1f..8799109dd 100644 --- a/integration/setup/companion-stack.yaml +++ b/integration/setup/companion-stack.yaml @@ -354,7 +354,7 @@ Resources: def _sweep_apis(apig, cutoff, ctx): for page in apig.get_paginator('get_rest_apis').paginate(): for api in page['items']: - if not _has_time(ctx): + if ctx.get_remaining_time_in_millis() < 420000: return if not _is_test(api.get('name', '')): continue From 5d0f66ec09fba33f4fc7e1cac6948311ac547fc7 Mon Sep 17 00:00:00 2001 From: Vichy Meas Date: Thu, 28 May 2026 12:18:57 -0700 Subject: [PATCH 3/5] fix: move rate-limit sleep outside try block in API sweep Ensures the 35s sleep runs even when delete_rest_api throws (e.g. TooManyRequestsException), preventing back-to-back failing calls that would worsen throttling. --- integration/setup/companion-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/setup/companion-stack.yaml b/integration/setup/companion-stack.yaml index 8799109dd..7e3acfe0e 100644 --- a/integration/setup/companion-stack.yaml +++ b/integration/setup/companion-stack.yaml @@ -364,8 +364,8 @@ Resources: try: print(f"Deleting API: {api['name']}") apig.delete_rest_api(restApiId=api['id']) - time.sleep(35) except Exception: pass + time.sleep(35) def _sweep_stacks(cfn, iam, cutoff, ctx): for page in cfn.get_paginator('list_stacks').paginate(StackStatusFilter=ELIGIBLE_STATUSES): From 56d4dc8a42b1ea5b4fcf9fa46e4f1ababf612a18 Mon Sep 17 00:00:00 2001 From: Vichy Meas Date: Thu, 28 May 2026 14:47:04 -0700 Subject: [PATCH 4/5] feat: increase sweeper schedule to every 30 minutes More frequent sweeps reduce orphaned resource accumulation and align better with the 6-hour age cutoff for cleanup eligibility. --- integration/setup/companion-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/setup/companion-stack.yaml b/integration/setup/companion-stack.yaml index 7e3acfe0e..dc621119b 100644 --- a/integration/setup/companion-stack.yaml +++ b/integration/setup/companion-stack.yaml @@ -447,7 +447,7 @@ Resources: TestStackSweeperSchedule: Type: AWS::Events::Rule Properties: - ScheduleExpression: rate(6 hours) + ScheduleExpression: rate(30 minutes) State: ENABLED Targets: - Arn: !GetAtt TestStackSweeperFunction.Arn From 7cc973bb3caf33885c177b59b9eb7acc3bf01770 Mon Sep 17 00:00:00 2001 From: Vichy Meas Date: Thu, 28 May 2026 14:56:22 -0700 Subject: [PATCH 5/5] fix: wrap API sweep in try/except for phase isolation Ensures _sweep_stacks and _sweep_logs still run even if _sweep_apis encounters an unexpected error like pagination failure. --- integration/setup/companion-stack.yaml | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/integration/setup/companion-stack.yaml b/integration/setup/companion-stack.yaml index dc621119b..bfe448346 100644 --- a/integration/setup/companion-stack.yaml +++ b/integration/setup/companion-stack.yaml @@ -352,20 +352,19 @@ Resources: _sweep_logs(logs, cutoff, ctx) def _sweep_apis(apig, cutoff, ctx): - for page in apig.get_paginator('get_rest_apis').paginate(): - for api in page['items']: - if ctx.get_remaining_time_in_millis() < 420000: - return - if not _is_test(api.get('name', '')): - continue - c = api.get('createdDate') - if c and c.replace(tzinfo=timezone.utc) >= cutoff: - continue - try: - print(f"Deleting API: {api['name']}") - apig.delete_rest_api(restApiId=api['id']) - except Exception: pass - time.sleep(35) + try: + for page in apig.get_paginator('get_rest_apis').paginate(): + for api in page['items']: + if ctx.get_remaining_time_in_millis() < 420000: return + if not _is_test(api.get('name', '')): continue + c = api.get('createdDate') + if c and c.replace(tzinfo=timezone.utc) >= cutoff: continue + try: + print(f"Deleting API: {api['name']}") + apig.delete_rest_api(restApiId=api['id']) + except Exception: pass + time.sleep(35) + except Exception as e: print(f"api err: {e}") def _sweep_stacks(cfn, iam, cutoff, ctx): for page in cfn.get_paginator('list_stacks').paginate(StackStatusFilter=ELIGIBLE_STATUSES):