From 4f5c4c8f933e43ae6199350900e81052f7c25de4 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Mon, 27 Apr 2026 23:53:35 +0000 Subject: [PATCH 1/7] test(sagemaker-serve): re-enable skipped integ tests and add cleanup/timing logs --- .../test_model_customization_deployment.py | 100 ++++++++++++++++-- .../tests/integ/test_optimize_integration.py | 63 +++++++++-- 2 files changed, 141 insertions(+), 22 deletions(-) diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py index 0eacafbaa2..315326e812 100644 --- a/sagemaker-serve/tests/integ/test_model_customization_deployment.py +++ b/sagemaker-serve/tests/integ/test_model_customization_deployment.py @@ -13,9 +13,13 @@ """Integration tests for ModelBuilder model customization deployment.""" from __future__ import absolute_import +import logging + import pytest import random +logger = logging.getLogger(__name__) + @pytest.fixture(scope="module") def training_job_name(): @@ -54,58 +58,132 @@ def cleanup_e2e_endpoints(): import os from botocore.exceptions import ClientError + logger.info("[cleanup_e2e_endpoints] setup: starting session-scoped sweep") + # This file's tests use us-west-2 resources. Set SAGEMAKER_REGION so the # SDK's SageMakerClient creates sessions in the correct region from the start. # Save/restore to avoid leaking into other test files. original_sm_region = os.environ.get("SAGEMAKER_REGION") os.environ["SAGEMAKER_REGION"] = "us-west-2" + logger.info( + "[cleanup_e2e_endpoints] setup: SAGEMAKER_REGION set to us-west-2 " + "(previous value=%s)", + original_sm_region, + ) from sagemaker.core.resources import Endpoint # Cleanup before tests + pre_deleted = 0 + pre_failed = 0 try: for endpoint in Endpoint.get_all(): try: if endpoint.endpoint_name.startswith('e2e-'): + logger.info( + "[cleanup_e2e_endpoints] setup: deleting stale endpoint %s " + "(status=%s)", + endpoint.endpoint_name, + getattr(endpoint, "endpoint_status", "unknown"), + ) endpoint.delete() - except (ClientError, Exception): - pass - except (ClientError, Exception): - pass + pre_deleted += 1 + except (ClientError, Exception) as exc: + pre_failed += 1 + logger.warning( + "[cleanup_e2e_endpoints] setup: failed to delete %s: %s", + getattr(endpoint, "endpoint_name", ""), + exc, + ) + except (ClientError, Exception) as exc: + logger.warning( + "[cleanup_e2e_endpoints] setup: Endpoint.get_all() failed: %s", exc + ) + logger.info( + "[cleanup_e2e_endpoints] setup: sweep complete (deleted=%d, failed=%d)", + pre_deleted, + pre_failed, + ) yield + logger.info("[cleanup_e2e_endpoints] teardown: starting session-scoped sweep") + # Cleanup after tests + post_deleted = 0 + post_failed = 0 try: for endpoint in Endpoint.get_all(): try: if endpoint.endpoint_name.startswith('e2e-'): + logger.info( + "[cleanup_e2e_endpoints] teardown: deleting endpoint %s " + "(status=%s)", + endpoint.endpoint_name, + getattr(endpoint, "endpoint_status", "unknown"), + ) endpoint.delete() - except (ClientError, Exception): - pass - except (ClientError, Exception): - pass + post_deleted += 1 + except (ClientError, Exception) as exc: + post_failed += 1 + logger.warning( + "[cleanup_e2e_endpoints] teardown: failed to delete %s: %s", + getattr(endpoint, "endpoint_name", ""), + exc, + ) + except (ClientError, Exception) as exc: + logger.warning( + "[cleanup_e2e_endpoints] teardown: Endpoint.get_all() failed: %s", exc + ) + logger.info( + "[cleanup_e2e_endpoints] teardown: sweep complete (deleted=%d, failed=%d)", + post_deleted, + post_failed, + ) # Restore original SAGEMAKER_REGION if original_sm_region: os.environ["SAGEMAKER_REGION"] = original_sm_region elif "SAGEMAKER_REGION" in os.environ: del os.environ["SAGEMAKER_REGION"] + logger.info( + "[cleanup_e2e_endpoints] teardown: SAGEMAKER_REGION restored to %s", + original_sm_region, + ) @pytest.fixture(scope="module") def cleanup_endpoints(): """Track endpoints to cleanup after tests.""" endpoints_to_cleanup = [] + logger.info("[cleanup_endpoints] setup: tracker initialized") yield endpoints_to_cleanup + logger.info( + "[cleanup_endpoints] teardown: processing %d tracked endpoint(s)", + len(endpoints_to_cleanup), + ) + deleted = 0 + failed = 0 for ep_name in endpoints_to_cleanup: try: from sagemaker.core.resources import Endpoint + logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) endpoint = Endpoint.get(endpoint_name=ep_name) endpoint.delete() - except Exception: - pass + deleted += 1 + except Exception as exc: + failed += 1 + logger.warning( + "[cleanup_endpoints] teardown: failed to delete %s: %s", + ep_name, + exc, + ) + logger.info( + "[cleanup_endpoints] teardown: complete (deleted=%d, failed=%d)", + deleted, + failed, + ) class TestModelCustomizationFromTrainingJob: @@ -127,7 +205,7 @@ def test_build_from_training_job(self, training_job_name): assert model_builder.image_uri is not None assert model_builder.instance_type is not None - @pytest.mark.skip(reason="Skipped: parallel cleanup race condition under investigation") + # @pytest.mark.skip(reason="Skipped: parallel cleanup race condition under investigation") def test_deploy_from_training_job(self, training_job_name, endpoint_name, cleanup_endpoints): """Test deploying model from training job. diff --git a/sagemaker-serve/tests/integ/test_optimize_integration.py b/sagemaker-serve/tests/integ/test_optimize_integration.py index 4df85b6bc6..7a82b97307 100644 --- a/sagemaker-serve/tests/integ/test_optimize_integration.py +++ b/sagemaker-serve/tests/integ/test_optimize_integration.py @@ -34,35 +34,66 @@ AWS_ACCOUNT_ID = "593793038179" AWS_REGION = "us-east-2" - -@pytest.mark.skip(reason="Test takes too long to run") +# @pytest.mark.skip(reason="Test takes too long to run") def test_optimize_build_deploy_invoke_cleanup(): """Integration test for Optimize workflow""" logger.info("Starting Optimize integration test...") - + test_started_at = time.perf_counter() + optimized_model = None core_endpoint = None - + build_deploy_elapsed = None + predict_elapsed = None + try: # Build and deploy logger.info("Optimizing and deploying model...") + phase_started = time.perf_counter() optimized_model, core_endpoint = build_and_deploy() - + build_deploy_elapsed = time.perf_counter() - phase_started + logger.info( + "[timing] build_and_deploy completed in %.2fs", + build_deploy_elapsed, + ) + # Make prediction logger.info("Making prediction...") + phase_started = time.perf_counter() make_prediction(core_endpoint) - + predict_elapsed = time.perf_counter() - phase_started + logger.info( + "[timing] make_prediction completed in %.2fs", + predict_elapsed, + ) + # Test passed successfully logger.info("Optimize integration test completed successfully") - + except Exception as e: logger.error(f"Optimize integration test failed: {str(e)}") raise finally: # Cleanup resources + cleanup_elapsed = None if optimized_model and core_endpoint: logger.info("Cleaning up resources...") + phase_started = time.perf_counter() cleanup_resources(optimized_model, core_endpoint) + cleanup_elapsed = time.perf_counter() - phase_started + logger.info( + "[timing] cleanup_resources completed in %.2fs", + cleanup_elapsed, + ) + + total_elapsed = time.perf_counter() - test_started_at + logger.info( + "[timing] test_optimize_build_deploy_invoke_cleanup total=%.2fs " + "(build_and_deploy=%s, make_prediction=%s, cleanup=%s)", + total_elapsed, + f"{build_deploy_elapsed:.2f}s" if build_deploy_elapsed is not None else "skipped", + f"{predict_elapsed:.2f}s" if predict_elapsed is not None else "skipped", + f"{cleanup_elapsed:.2f}s" if cleanup_elapsed is not None else "skipped", + ) def create_schema_builder(): @@ -91,6 +122,7 @@ def build_and_deploy(): # Optimize the model logger.info("Optimizing JumpStart model...") default_bucket = sagemaker_session.default_bucket() + optimize_started = time.perf_counter() optimized_model = model_builder.optimize( model_name=f"{MODEL_NAME_PREFIX}-{unique_id}", instance_type="ml.g5.2xlarge", @@ -100,17 +132,26 @@ def build_and_deploy(): job_name=f"js-optimize-{int(time.time())}", image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124" ) - logger.info(f"Model Successfully Optimized: {optimized_model.model_name}") - + logger.info( + "[timing] model_builder.optimize completed in %.2fs (model=%s)", + time.perf_counter() - optimize_started, + optimized_model.model_name, + ) + # Deploy the optimized model logger.info("Deploying optimized model to endpoint...") + deploy_started = time.perf_counter() core_endpoint = model_builder.deploy( endpoint_name=f"{ENDPOINT_NAME_PREFIX}-{unique_id}", initial_instance_count=1, instance_type="ml.g5.2xlarge" ) - logger.info(f"Endpoint Successfully Created: {core_endpoint.endpoint_name}") - + logger.info( + "[timing] model_builder.deploy completed in %.2fs (endpoint=%s)", + time.perf_counter() - deploy_started, + core_endpoint.endpoint_name, + ) + return optimized_model, core_endpoint From 6ac87778d8689ab3a05f9e9252ccd4c557827e78 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 28 Apr 2026 20:22:57 +0000 Subject: [PATCH 2/7] debug: Add temp log statements to debug tests --- .../src/sagemaker/core/utils/utils.py | 6 +- .../src/sagemaker/serve/model_builder.py | 151 +++++++++++++++++- .../tests/integ/test_optimize_integration.py | 2 +- 3 files changed, 155 insertions(+), 4 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/utils/utils.py b/sagemaker-core/src/sagemaker/core/utils/utils.py index 2881695969..72bc665a9b 100644 --- a/sagemaker-core/src/sagemaker/core/utils/utils.py +++ b/sagemaker-core/src/sagemaker/core/utils/utils.py @@ -16,6 +16,7 @@ import os import re import subprocess +import traceback from boto3.session import Session from botocore.config import Config @@ -353,7 +354,10 @@ def __init__( session = Session(region_name=region_name) if region_name is None: - logger.warning("No region provided. Using default region.") + logger.warning( + "No region provided. Using default region.\nStack trace:\n%s", + "".join(traceback.format_stack()), + ) region_name = session.region_name if config is None: diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 7c7af2defc..3e8a36ed51 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -4381,6 +4381,11 @@ def _deploy_model_customization( # Check if endpoint exists is_existing_endpoint = self._does_endpoint_exist(endpoint_name) + logger.info( + "[_deploy_model_customization] endpoint_name=%s is_existing_endpoint=%s", + endpoint_name, + is_existing_endpoint, + ) if not is_existing_endpoint: EndpointConfig.create( @@ -4394,13 +4399,78 @@ def _deploy_model_customization( ], execution_role_arn=self.role_arn, ) + logger.info( + "[_deploy_model_customization] EndpointConfig created: %s", endpoint_name + ) logger.info("Endpoint core call starting") + # Capture client / region state right at endpoint creation to help + # diagnose "Could not find endpoint" that sometimes surfaces in + # later refresh() calls (singleton region / env var drift). + try: + import os as _os + + from sagemaker.core.utils.utils import SageMakerClient as _SMClient + + _sm_region_env = _os.environ.get("SAGEMAKER_REGION") + _probe_client = _SMClient().sagemaker_client + logger.info( + "[_deploy_model_customization] Client state before " + "Endpoint.create: SAGEMAKER_REGION=%s " + "sagemaker_client.meta.region_name=%s", + _sm_region_env, + getattr(getattr(_probe_client, "meta", None), "region_name", None), + ) + except Exception as _probe_exc: + logger.info( + "[_deploy_model_customization] Could not probe client region " + "state before Endpoint.create: %s", + _probe_exc, + ) endpoint = Endpoint.create( endpoint_name=endpoint_name, endpoint_config_name=endpoint_name ) - endpoint.wait_for_status("InService") + logger.info( + "[_deploy_model_customization] Endpoint.create returned: " + "endpoint_name=%s endpoint_arn=%s endpoint_status=%s", + getattr(endpoint, "endpoint_name", None), + getattr(endpoint, "endpoint_arn", None), + getattr(endpoint, "endpoint_status", None), + ) + try: + endpoint.wait_for_status("InService") + except Exception as wait_exc: + logger.error( + "[_deploy_model_customization] Initial wait_for_status(InService) " + "failed for endpoint_name=%s: %s", + endpoint_name, + wait_exc, + ) + # Try to re-describe to see if the endpoint truly disappeared or + # if this is a transient/consistency issue. + try: + recheck = Endpoint.get(endpoint_name=endpoint_name) + logger.error( + "[_deploy_model_customization] Re-describe after wait failure " + "succeeded: status=%s arn=%s", + getattr(recheck, "endpoint_status", None), + getattr(recheck, "endpoint_arn", None), + ) + except Exception as recheck_exc: + logger.error( + "[_deploy_model_customization] Re-describe after wait failure " + "also failed for %s: %s", + endpoint_name, + recheck_exc, + ) + raise else: endpoint = Endpoint.get(endpoint_name=endpoint_name) + logger.info( + "[_deploy_model_customization] Reusing existing endpoint: " + "endpoint_name=%s endpoint_status=%s", + getattr(endpoint, "endpoint_name", None), + getattr(endpoint, "endpoint_status", None), + ) peft_type = self._fetch_peft() base_model_recipe_name = model_package.inference_specification.containers[ @@ -4454,9 +4524,86 @@ def _deploy_model_customization( # Wait for base IC to be InService before creating adapter base_ic = InferenceComponent.get(inference_component_name=base_ic_name) base_ic.wait_for_status("InService") + logger.info( + "[_deploy_model_customization] base IC InService: name=%s arn=%s", + getattr(base_ic, "inference_component_name", None), + getattr(base_ic, "inference_component_arn", None), + ) + + # Probe endpoint before waiting — we've seen cases where the + # endpoint is reported as missing here. Capture state for diagnosis. + try: + import os as _os + + from sagemaker.core.utils.utils import SageMakerClient as _SMClient + + _sm_region_env = _os.environ.get("SAGEMAKER_REGION") + _probe_client = _SMClient().sagemaker_client + logger.info( + "[_deploy_model_customization] Client state before second " + "wait_for_status: SAGEMAKER_REGION=%s " + "sagemaker_client.meta.region_name=%s", + _sm_region_env, + getattr(getattr(_probe_client, "meta", None), "region_name", None), + ) + except Exception as _probe_exc: + logger.info( + "[_deploy_model_customization] Could not probe client region " + "state before second wait_for_status: %s", + _probe_exc, + ) + try: + probe = Endpoint.get(endpoint_name=endpoint_name) + logger.info( + "[_deploy_model_customization] Pre-wait endpoint probe: " + "endpoint_name=%s endpoint_status=%s endpoint_arn=%s " + "failure_reason=%s", + getattr(probe, "endpoint_name", None), + getattr(probe, "endpoint_status", None), + getattr(probe, "endpoint_arn", None), + getattr(probe, "failure_reason", None), + ) + except Exception as probe_exc: + logger.error( + "[_deploy_model_customization] Pre-wait endpoint probe FAILED " + "for %s before second wait_for_status: %s", + endpoint_name, + probe_exc, + ) # Wait for endpoint to stabilize after base IC creation - endpoint.wait_for_status("InService") + try: + endpoint.wait_for_status("InService") + except Exception as wait_exc: + logger.error( + "[_deploy_model_customization] wait_for_status(InService) after " + "base IC creation failed for endpoint_name=%s " + "(endpoint.endpoint_name=%s, endpoint.endpoint_arn=%s): %s", + endpoint_name, + getattr(endpoint, "endpoint_name", None), + getattr(endpoint, "endpoint_arn", None), + wait_exc, + ) + # Try to list endpoints to see whether this is a region mismatch + # vs. an actual deletion. + try: + visible = [ + ep.endpoint_name + for ep in Endpoint.get_all() + if getattr(ep, "endpoint_name", "").startswith("e2e-") + ] + logger.error( + "[_deploy_model_customization] Visible e2e-* endpoints at " + "failure time: %s", + visible, + ) + except Exception as list_exc: + logger.error( + "[_deploy_model_customization] Endpoint.get_all() also " + "failed: %s", + list_exc, + ) + raise # Deploy adapter IC adapter_ic_name = inference_component_name or f"{endpoint_name}-adapter" diff --git a/sagemaker-serve/tests/integ/test_optimize_integration.py b/sagemaker-serve/tests/integ/test_optimize_integration.py index 7a82b97307..16862bad3d 100644 --- a/sagemaker-serve/tests/integ/test_optimize_integration.py +++ b/sagemaker-serve/tests/integ/test_optimize_integration.py @@ -34,7 +34,7 @@ AWS_ACCOUNT_ID = "593793038179" AWS_REGION = "us-east-2" -# @pytest.mark.skip(reason="Test takes too long to run") +@pytest.mark.skip(reason="Test takes too long to run") def test_optimize_build_deploy_invoke_cleanup(): """Integration test for Optimize workflow""" logger.info("Starting Optimize integration test...") From 999d37f87b8085b5979596dbe63d2d3b2ae4d8d1 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 28 Apr 2026 21:32:24 +0000 Subject: [PATCH 3/7] debug: Add more logging around endpoint.wait_for_status + re-fetch endpoint for freshness --- .../src/sagemaker/serve/model_builder.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 3e8a36ed51..40650f1b41 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -4573,6 +4573,27 @@ def _deploy_model_customization( # Wait for endpoint to stabilize after base IC creation try: + # Re-fetch the endpoint object to ensure fresh state — + # the original object from Endpoint.create may hold a stale + # internal client or cached attributes that cause + # DescribeEndpoint to fail with "Could not find endpoint". + endpoint = Endpoint.get(endpoint_name=endpoint_name) + logger.info( + "[_deploy_model_customization] Endpoint.get before second " + "wait_for_status returned: endpoint_name=%s endpoint_arn=%s " + "endpoint_status=%s endpoint_config_name=%s " + "creation_time=%s last_modified_time=%s failure_reason=%s " + "production_variants=%s pending_deployment_summary=%s", + getattr(endpoint, "endpoint_name", None), + getattr(endpoint, "endpoint_arn", None), + getattr(endpoint, "endpoint_status", None), + getattr(endpoint, "endpoint_config_name", None), + getattr(endpoint, "creation_time", None), + getattr(endpoint, "last_modified_time", None), + getattr(endpoint, "failure_reason", None), + getattr(endpoint, "production_variants", None), + getattr(endpoint, "pending_deployment_summary", None), + ) endpoint.wait_for_status("InService") except Exception as wait_exc: logger.error( From 2982557d896aefe5a7da87f8f2e147d6a7ddd012 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 28 Apr 2026 23:13:42 +0000 Subject: [PATCH 4/7] debug: disable endpoint cleanup --- .../src/sagemaker/serve/model_builder.py | 4 +- .../test_model_customization_deployment.py | 196 +++++++++--------- 2 files changed, 101 insertions(+), 99 deletions(-) diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 40650f1b41..124b88f89a 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -4577,7 +4577,8 @@ def _deploy_model_customization( # the original object from Endpoint.create may hold a stale # internal client or cached attributes that cause # DescribeEndpoint to fail with "Could not find endpoint". - endpoint = Endpoint.get(endpoint_name=endpoint_name) + logger.info("Attempting to fetch endpoint: %s", endpoint_name) + endpoint = Endpoint.get(endpoint_name=endpoint_name, region="us-west-2") logger.info( "[_deploy_model_customization] Endpoint.get before second " "wait_for_status returned: endpoint_name=%s endpoint_arn=%s " @@ -4604,6 +4605,7 @@ def _deploy_model_customization( getattr(endpoint, "endpoint_name", None), getattr(endpoint, "endpoint_arn", None), wait_exc, + exc_info=True, ) # Try to list endpoints to see whether this is a region mismatch # vs. an actual deletion. diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py index 315326e812..0572dd3a48 100644 --- a/sagemaker-serve/tests/integ/test_model_customization_deployment.py +++ b/sagemaker-serve/tests/integ/test_model_customization_deployment.py @@ -71,75 +71,75 @@ def cleanup_e2e_endpoints(): original_sm_region, ) - from sagemaker.core.resources import Endpoint - - # Cleanup before tests - pre_deleted = 0 - pre_failed = 0 - try: - for endpoint in Endpoint.get_all(): - try: - if endpoint.endpoint_name.startswith('e2e-'): - logger.info( - "[cleanup_e2e_endpoints] setup: deleting stale endpoint %s " - "(status=%s)", - endpoint.endpoint_name, - getattr(endpoint, "endpoint_status", "unknown"), - ) - endpoint.delete() - pre_deleted += 1 - except (ClientError, Exception) as exc: - pre_failed += 1 - logger.warning( - "[cleanup_e2e_endpoints] setup: failed to delete %s: %s", - getattr(endpoint, "endpoint_name", ""), - exc, - ) - except (ClientError, Exception) as exc: - logger.warning( - "[cleanup_e2e_endpoints] setup: Endpoint.get_all() failed: %s", exc - ) - logger.info( - "[cleanup_e2e_endpoints] setup: sweep complete (deleted=%d, failed=%d)", - pre_deleted, - pre_failed, - ) + # from sagemaker.core.resources import Endpoint + + # # Cleanup before tests + # pre_deleted = 0 + # pre_failed = 0 + # try: + # for endpoint in Endpoint.get_all(): + # try: + # if endpoint.endpoint_name.startswith('e2e-'): + # logger.info( + # "[cleanup_e2e_endpoints] setup: deleting stale endpoint %s " + # "(status=%s)", + # endpoint.endpoint_name, + # getattr(endpoint, "endpoint_status", "unknown"), + # ) + # endpoint.delete() + # pre_deleted += 1 + # except (ClientError, Exception) as exc: + # pre_failed += 1 + # logger.warning( + # "[cleanup_e2e_endpoints] setup: failed to delete %s: %s", + # getattr(endpoint, "endpoint_name", ""), + # exc, + # ) + # except (ClientError, Exception) as exc: + # logger.warning( + # "[cleanup_e2e_endpoints] setup: Endpoint.get_all() failed: %s", exc + # ) + # logger.info( + # "[cleanup_e2e_endpoints] setup: sweep complete (deleted=%d, failed=%d)", + # pre_deleted, + # pre_failed, + # ) yield - logger.info("[cleanup_e2e_endpoints] teardown: starting session-scoped sweep") - - # Cleanup after tests - post_deleted = 0 - post_failed = 0 - try: - for endpoint in Endpoint.get_all(): - try: - if endpoint.endpoint_name.startswith('e2e-'): - logger.info( - "[cleanup_e2e_endpoints] teardown: deleting endpoint %s " - "(status=%s)", - endpoint.endpoint_name, - getattr(endpoint, "endpoint_status", "unknown"), - ) - endpoint.delete() - post_deleted += 1 - except (ClientError, Exception) as exc: - post_failed += 1 - logger.warning( - "[cleanup_e2e_endpoints] teardown: failed to delete %s: %s", - getattr(endpoint, "endpoint_name", ""), - exc, - ) - except (ClientError, Exception) as exc: - logger.warning( - "[cleanup_e2e_endpoints] teardown: Endpoint.get_all() failed: %s", exc - ) - logger.info( - "[cleanup_e2e_endpoints] teardown: sweep complete (deleted=%d, failed=%d)", - post_deleted, - post_failed, - ) + # logger.info("[cleanup_e2e_endpoints] teardown: starting session-scoped sweep") + + # # Cleanup after tests + # post_deleted = 0 + # post_failed = 0 + # try: + # for endpoint in Endpoint.get_all(): + # try: + # if endpoint.endpoint_name.startswith('e2e-'): + # logger.info( + # "[cleanup_e2e_endpoints] teardown: deleting endpoint %s " + # "(status=%s)", + # endpoint.endpoint_name, + # getattr(endpoint, "endpoint_status", "unknown"), + # ) + # endpoint.delete() + # post_deleted += 1 + # except (ClientError, Exception) as exc: + # post_failed += 1 + # logger.warning( + # "[cleanup_e2e_endpoints] teardown: failed to delete %s: %s", + # getattr(endpoint, "endpoint_name", ""), + # exc, + # ) + # except (ClientError, Exception) as exc: + # logger.warning( + # "[cleanup_e2e_endpoints] teardown: Endpoint.get_all() failed: %s", exc + # ) + # logger.info( + # "[cleanup_e2e_endpoints] teardown: sweep complete (deleted=%d, failed=%d)", + # post_deleted, + # post_failed, + # ) # Restore original SAGEMAKER_REGION if original_sm_region: @@ -152,38 +152,38 @@ def cleanup_e2e_endpoints(): ) -@pytest.fixture(scope="module") -def cleanup_endpoints(): - """Track endpoints to cleanup after tests.""" - endpoints_to_cleanup = [] - logger.info("[cleanup_endpoints] setup: tracker initialized") - yield endpoints_to_cleanup - - logger.info( - "[cleanup_endpoints] teardown: processing %d tracked endpoint(s)", - len(endpoints_to_cleanup), - ) - deleted = 0 - failed = 0 - for ep_name in endpoints_to_cleanup: - try: - from sagemaker.core.resources import Endpoint - logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) - endpoint = Endpoint.get(endpoint_name=ep_name) - endpoint.delete() - deleted += 1 - except Exception as exc: - failed += 1 - logger.warning( - "[cleanup_endpoints] teardown: failed to delete %s: %s", - ep_name, - exc, - ) - logger.info( - "[cleanup_endpoints] teardown: complete (deleted=%d, failed=%d)", - deleted, - failed, - ) +# @pytest.fixture(scope="module") +# def cleanup_endpoints(): +# """Track endpoints to cleanup after tests.""" +# endpoints_to_cleanup = [] +# logger.info("[cleanup_endpoints] setup: tracker initialized") +# yield endpoints_to_cleanup + +# logger.info( +# "[cleanup_endpoints] teardown: processing %d tracked endpoint(s)", +# len(endpoints_to_cleanup), +# ) +# deleted = 0 +# failed = 0 +# for ep_name in endpoints_to_cleanup: +# try: +# from sagemaker.core.resources import Endpoint +# logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) +# endpoint = Endpoint.get(endpoint_name=ep_name) +# endpoint.delete() +# deleted += 1 +# except Exception as exc: +# failed += 1 +# logger.warning( +# "[cleanup_endpoints] teardown: failed to delete %s: %s", +# ep_name, +# exc, +# ) +# logger.info( +# "[cleanup_endpoints] teardown: complete (deleted=%d, failed=%d)", +# deleted, +# failed, +# ) class TestModelCustomizationFromTrainingJob: From 4a25e98cdaaaf16824a4679b21d3895fd5519a81 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 28 Apr 2026 23:38:56 +0000 Subject: [PATCH 5/7] debug: fixed var not defined error due to disabled fixture --- .../test_model_customization_deployment.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py index 0572dd3a48..2fd4e3360a 100644 --- a/sagemaker-serve/tests/integ/test_model_customization_deployment.py +++ b/sagemaker-serve/tests/integ/test_model_customization_deployment.py @@ -152,38 +152,38 @@ def cleanup_e2e_endpoints(): ) -# @pytest.fixture(scope="module") -# def cleanup_endpoints(): -# """Track endpoints to cleanup after tests.""" -# endpoints_to_cleanup = [] -# logger.info("[cleanup_endpoints] setup: tracker initialized") -# yield endpoints_to_cleanup - -# logger.info( -# "[cleanup_endpoints] teardown: processing %d tracked endpoint(s)", -# len(endpoints_to_cleanup), -# ) -# deleted = 0 -# failed = 0 -# for ep_name in endpoints_to_cleanup: -# try: -# from sagemaker.core.resources import Endpoint -# logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) -# endpoint = Endpoint.get(endpoint_name=ep_name) -# endpoint.delete() -# deleted += 1 -# except Exception as exc: -# failed += 1 -# logger.warning( -# "[cleanup_endpoints] teardown: failed to delete %s: %s", -# ep_name, -# exc, -# ) -# logger.info( -# "[cleanup_endpoints] teardown: complete (deleted=%d, failed=%d)", -# deleted, -# failed, -# ) +@pytest.fixture(scope="module") +def cleanup_endpoints(): + """Track endpoints to cleanup after tests.""" + endpoints_to_cleanup = [] + logger.info("[cleanup_endpoints] setup: tracker initialized") + yield endpoints_to_cleanup + + logger.info( + "[cleanup_endpoints] teardown: processing %d tracked endpoint(s)", + len(endpoints_to_cleanup), + ) + deleted = 0 + failed = 0 + for ep_name in endpoints_to_cleanup: + try: + from sagemaker.core.resources import Endpoint + logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) + endpoint = Endpoint.get(endpoint_name=ep_name) + # endpoint.delete() + deleted += 1 + except Exception as exc: + failed += 1 + logger.warning( + "[cleanup_endpoints] teardown: failed to delete %s: %s", + ep_name, + exc, + ) + logger.info( + "[cleanup_endpoints] teardown: complete (deleted=%d, failed=%d)", + deleted, + failed, + ) class TestModelCustomizationFromTrainingJob: From db780df8542c7b15213c40ce682af7110bc204d2 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Wed, 29 Apr 2026 16:07:08 +0000 Subject: [PATCH 6/7] fix: re-enable cleanup --- .../test_model_customization_deployment.py | 134 +++++++++--------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py index 2fd4e3360a..315326e812 100644 --- a/sagemaker-serve/tests/integ/test_model_customization_deployment.py +++ b/sagemaker-serve/tests/integ/test_model_customization_deployment.py @@ -71,75 +71,75 @@ def cleanup_e2e_endpoints(): original_sm_region, ) - # from sagemaker.core.resources import Endpoint - - # # Cleanup before tests - # pre_deleted = 0 - # pre_failed = 0 - # try: - # for endpoint in Endpoint.get_all(): - # try: - # if endpoint.endpoint_name.startswith('e2e-'): - # logger.info( - # "[cleanup_e2e_endpoints] setup: deleting stale endpoint %s " - # "(status=%s)", - # endpoint.endpoint_name, - # getattr(endpoint, "endpoint_status", "unknown"), - # ) - # endpoint.delete() - # pre_deleted += 1 - # except (ClientError, Exception) as exc: - # pre_failed += 1 - # logger.warning( - # "[cleanup_e2e_endpoints] setup: failed to delete %s: %s", - # getattr(endpoint, "endpoint_name", ""), - # exc, - # ) - # except (ClientError, Exception) as exc: - # logger.warning( - # "[cleanup_e2e_endpoints] setup: Endpoint.get_all() failed: %s", exc - # ) - # logger.info( - # "[cleanup_e2e_endpoints] setup: sweep complete (deleted=%d, failed=%d)", - # pre_deleted, - # pre_failed, - # ) + from sagemaker.core.resources import Endpoint + + # Cleanup before tests + pre_deleted = 0 + pre_failed = 0 + try: + for endpoint in Endpoint.get_all(): + try: + if endpoint.endpoint_name.startswith('e2e-'): + logger.info( + "[cleanup_e2e_endpoints] setup: deleting stale endpoint %s " + "(status=%s)", + endpoint.endpoint_name, + getattr(endpoint, "endpoint_status", "unknown"), + ) + endpoint.delete() + pre_deleted += 1 + except (ClientError, Exception) as exc: + pre_failed += 1 + logger.warning( + "[cleanup_e2e_endpoints] setup: failed to delete %s: %s", + getattr(endpoint, "endpoint_name", ""), + exc, + ) + except (ClientError, Exception) as exc: + logger.warning( + "[cleanup_e2e_endpoints] setup: Endpoint.get_all() failed: %s", exc + ) + logger.info( + "[cleanup_e2e_endpoints] setup: sweep complete (deleted=%d, failed=%d)", + pre_deleted, + pre_failed, + ) yield - # logger.info("[cleanup_e2e_endpoints] teardown: starting session-scoped sweep") - - # # Cleanup after tests - # post_deleted = 0 - # post_failed = 0 - # try: - # for endpoint in Endpoint.get_all(): - # try: - # if endpoint.endpoint_name.startswith('e2e-'): - # logger.info( - # "[cleanup_e2e_endpoints] teardown: deleting endpoint %s " - # "(status=%s)", - # endpoint.endpoint_name, - # getattr(endpoint, "endpoint_status", "unknown"), - # ) - # endpoint.delete() - # post_deleted += 1 - # except (ClientError, Exception) as exc: - # post_failed += 1 - # logger.warning( - # "[cleanup_e2e_endpoints] teardown: failed to delete %s: %s", - # getattr(endpoint, "endpoint_name", ""), - # exc, - # ) - # except (ClientError, Exception) as exc: - # logger.warning( - # "[cleanup_e2e_endpoints] teardown: Endpoint.get_all() failed: %s", exc - # ) - # logger.info( - # "[cleanup_e2e_endpoints] teardown: sweep complete (deleted=%d, failed=%d)", - # post_deleted, - # post_failed, - # ) + logger.info("[cleanup_e2e_endpoints] teardown: starting session-scoped sweep") + + # Cleanup after tests + post_deleted = 0 + post_failed = 0 + try: + for endpoint in Endpoint.get_all(): + try: + if endpoint.endpoint_name.startswith('e2e-'): + logger.info( + "[cleanup_e2e_endpoints] teardown: deleting endpoint %s " + "(status=%s)", + endpoint.endpoint_name, + getattr(endpoint, "endpoint_status", "unknown"), + ) + endpoint.delete() + post_deleted += 1 + except (ClientError, Exception) as exc: + post_failed += 1 + logger.warning( + "[cleanup_e2e_endpoints] teardown: failed to delete %s: %s", + getattr(endpoint, "endpoint_name", ""), + exc, + ) + except (ClientError, Exception) as exc: + logger.warning( + "[cleanup_e2e_endpoints] teardown: Endpoint.get_all() failed: %s", exc + ) + logger.info( + "[cleanup_e2e_endpoints] teardown: sweep complete (deleted=%d, failed=%d)", + post_deleted, + post_failed, + ) # Restore original SAGEMAKER_REGION if original_sm_region: @@ -170,7 +170,7 @@ def cleanup_endpoints(): from sagemaker.core.resources import Endpoint logger.info("[cleanup_endpoints] teardown: deleting %s", ep_name) endpoint = Endpoint.get(endpoint_name=ep_name) - # endpoint.delete() + endpoint.delete() deleted += 1 except Exception as exc: failed += 1 From ddb10157964d9366c9c368d785546d6b3cf152b8 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Wed, 29 Apr 2026 18:21:17 +0000 Subject: [PATCH 7/7] debug: disable def cleanup_endpoints() --- .../tests/integ/test_model_customization_deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sagemaker-serve/tests/integ/test_model_customization_deployment.py b/sagemaker-serve/tests/integ/test_model_customization_deployment.py index 315326e812..00e069000e 100644 --- a/sagemaker-serve/tests/integ/test_model_customization_deployment.py +++ b/sagemaker-serve/tests/integ/test_model_customization_deployment.py @@ -86,7 +86,7 @@ def cleanup_e2e_endpoints(): endpoint.endpoint_name, getattr(endpoint, "endpoint_status", "unknown"), ) - endpoint.delete() + # endpoint.delete() pre_deleted += 1 except (ClientError, Exception) as exc: pre_failed += 1