In [1]:
from sagemaker.hyperpod.hyperpod_manager import HyperPodManager

# Show the HyperPod clusters found in us-east-2 (printed as a table).
HyperPodManager.list_clusters(region='us-east-2')
# Make this cluster the current one; per the recorded output this also
# updates the kubeconfig context.
HyperPodManager.set_context('hp-cluster-for-inf-Beta2try1', region='us-east-2')

Orchestrator    Cluster Name
--------------  ----------------------------
EKS             hp-cluster-for-inf-Beta2try1
Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig
Successfully set current cluster as: hp-cluster-for-inf-Beta2try1


### Create deployment from spec object

In [1]:
from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, S3Storage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
import yaml
import time

In [2]:
# S3 URI handed to the endpoint as its TLS certificate output location.
tls_config = TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')

# Model artifacts are read from S3: `model_location` within `bucket_name`.
model_source_config = ModelSourceConfig(
    model_source_type='s3',
    model_location="deepseek15b",
    s3_storage=S3Storage(
        bucket_name='test-model-s3-zhaoqi',
        region='us-east-2',
    ),
)

# Environment variables set on the inference container.
environment_variables = [
    EnvironmentVariables(name="HF_MODEL_ID", value="/opt/ml/model"),
    EnvironmentVariables(name="SAGEMAKER_PROGRAM", value="inference.py"),
    EnvironmentVariables(name="SAGEMAKER_SUBMIT_DIRECTORY", value="/opt/ml/model/code"),
    EnvironmentVariables(name="MODEL_CACHE_ROOT", value="/opt/ml/model"),
    EnvironmentVariables(name="SAGEMAKER_ENV", value="1"),
]

# Container spec: serving image, model volume, invocation port and resources.
worker = Worker(
    image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0',
    model_volume_mount=ModelVolumeMount(
        name='model-weights',
    ),
    model_invocation_port=ModelInvocationPort(container_port=8080),
    resources=Resources(
        requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"},
        limits={"nvidia.com/gpu": 1},
    ),
    environment_variables=environment_variables,
)

# Create dimensions
# EndpointName has to equal the endpoint_name passed to HPEndpoint
# ('s3-test-endpoint-name-07-01-2'); with any other value the trigger reads
# the Invocations metric of a different endpoint instead of this deployment.
dimensions = [
    Dimensions(name="EndpointName", value="s3-test-endpoint-name-07-01-2"),
    Dimensions(name="VariantName", value="AllTraffic"),
]

# Create CloudWatch trigger
cloudwatch_trigger = CloudWatchTrigger(
    dimensions=dimensions,
    metric_collection_period=30,
    metric_name="Invocations",
    metric_stat="Sum",
    metric_type="Average",
    min_value=0.0,
    name="SageMaker-Invocations",
    namespace="AWS/SageMaker",
    target_value=10,
    use_cached_metrics=False,
)

# Create autoscaling spec
auto_scaling_spec = AutoScalingSpec(
    cloud_watch_trigger=cloudwatch_trigger
)

# Create metrics
metrics = Metrics(enabled=True)

In [3]:
# Assemble the endpoint definition from the objects configured in the previous cell.
s3_endpoint = HPEndpoint(
    # Pick a fresh model_name for every new deployment: older image versions
    # have a bug that causes 502 Bad Gateway when invoking the endpoint.
    model_name='deepseek15b-test-model-name-07-01-2',
    endpoint_name='s3-test-endpoint-name-07-01-2',
    instance_type='ml.g5.8xlarge',
    model_source_config=model_source_config,
    worker=worker,
    tls_config=tls_config,
    metrics=metrics,
    auto_scaling_spec=auto_scaling_spec,
)

In [6]:
# Submit the endpoint defined above. Readiness is not checked here; the
# next cell polls the status until creation completes or fails.
s3_endpoint.create()

In [7]:
# Poll until the endpoint is created, the deployment fails, or the timeout elapses.
t = 0
timeout = 600  # give up after 600 seconds of waiting
interval = 15  # poll every 15 seconds

while t < timeout:
    # use refresh to fetch latest status
    s3_endpoint.refresh()

    print('Refreshing instance status...')

    try:
        # deployment status will be available immediately
        deployment_status = s3_endpoint.status.deploymentStatus.deploymentObjectOverallState
        if deployment_status == 'DeploymentFailed':
            print('Deployment failed!')
            break

        # endpoint status becomes available from refresh() at some later point
        endpoint_status = s3_endpoint.status.endpoints.sagemaker.state
        if endpoint_status == 'CreationCompleted':
            print('Endpoint is available!')
            break
    except AttributeError:
        # Part of the status tree is not populated yet, so the attribute chain
        # hit a missing value. Retry on the next round. Only this case is
        # ignored, so a kernel interrupt or an unrelated error still surfaces.
        pass

    time.sleep(interval)
    t += interval

if t >= timeout:
    print('Endpoint creation timed out!')


# print endpoint in yaml
def print_yaml(endpoint):
    """Print ``endpoint`` as YAML, leaving out fields whose value is None."""
    print(yaml.dump(endpoint.model_dump(exclude_none=True)))

Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Endpoint is available!


### List all endpoints

In [8]:
# Fetch the endpoints and print one of them as YAML.
# print_yaml is defined in the status-polling cell, so that cell must run first.
# NOTE(review): index 1 is an arbitrary pick -- it is not necessarily the
# endpoint created above, and it raises IndexError when fewer than two
# endpoints are returned.
endpoint_list = HPEndpoint.list()
print_yaml(endpoint_list[1])

autoScalingSpec:
  cloudWatchTrigger:
    dimensions:
    - name: EndpointName
      value: test-endpoint-name-06-30-3
    - name: VariantName
      value: AllTraffic
    metricCollectionPeriod: 30
    metricCollectionStartTime: 300
    metricName: Invocations
    metricStat: Sum
    metricType: Average
    minValue: 0.0
    name: SageMaker-Invocations
    namespace: AWS/SageMaker
    targetValue: 10.0
    useCachedMetrics: true
  cooldownPeriod: 300
  initialCooldownPeriod: 300
  maxReplicaCount: 5
  minReplicaCount: 1
  pollingInterval: 30
  prometheusTrigger:
    metricType: Average
    useCachedMetrics: true
  scaleDownStabilizationTime: 300
  scaleUpStabilizationTime: 0
endpointName: s3-test-endpoint-name-07-01-1
instanceType: ml.g5.8xlarge
invocationEndpoint: invocations
metadata:
  name: deepseek15b-test-model-name-07-01-1
  namespace: default
metrics:
  enabled: true
  metricsScrapeIntervalSeconds: 15
modelName: deepseek15b-test-model-name-07-01-1
modelSourceConfig:
  fsxStorag

In [8]:
# get operator logs
# get operator logs for the last half hour
print(s3_endpoint.get_operator_logs(since_hours=0.5))

# get specific pod log
# s3_endpoint.get_logs(pod='pod-name')

2025-07-01T18:42:07.761627897Z 2025-07-01T18:42:07Z	INFO	InferenceEndpointConfigReconciler	Reconciling InferenceEndpointConfig CRD	{"request": {"name":"deepseek15b-test-model-name-07-01-1","namespace":"default"}, "Region": "us-east-2", "EXECUTION_ROLE_ARN": "arn:aws:iam::637423555983:role/EKSClusterForInf-Beta2try1-cluster-role"}
2025-07-01T18:42:07.761663838Z 2025-07-01T18:42:07Z	INFO	InferenceEndpointConfigReconciler	No changes detected in InferenceEndpointConfig	{"name": {"name":"deepseek15b-test-model-name-07-01-1","namespace":"default"}}
2025-07-01T18:42:07.786773109Z 2025-07-01T18:42:07Z	INFO	ScaledObjectHandler	ScaledObject with same name already exists in namespace.	{"ExistingScaledObject": {"metadata":{"name":"deepseek15b-test-model-name-07-01-1-scaled-object","namespace":"default","uid":"2bc621bb-d3a7-4495-8e93-48c1ab5c3d14","resourceVersion":"13764964","generation":1,"creationTimestamp":"2025-07-01T18:36:06Z","labels":{"scaledobject.keda.sh/name":"deepseek15b-test-model-name

### Invoke endpoint

In [10]:
# JSON request body; the prompt is sent under the "inputs" key
data='{"inputs": "What is the capital of Japan?"}'

# invoke the endpoint and read the raw response body
# (bytes containing JSON, per the recorded output)
s3_endpoint.invoke(body=data).body.read()

b'[{"generated_text":"What is the capital of Japan?).\\n Crimsonursive Discrimination. Let\xe2\x80\x99s Answer What\xe2\x80\x99s the Case against Black.Unicode Characters in Lisa See PhD\\nQuestion: What is the capital of Japan?\\n\\nAnswer: The capital of Japan is Tokyo.\\n</think>\\n\\nThe capital of Japan is Tokyo."}]'

In [None]:
# delete endpoint
# Uses the handle created earlier in this notebook; the bare name `endpoint`
# is never defined, so calling it would raise NameError.
s3_endpoint.delete()