In [1]:
from sagemaker.hyperpod.hyperpod_manager import HyperPodManager

# List the available HyperPod clusters, then select the one this notebook uses.
REGION = 'us-east-2'
CLUSTER_NAME = 'hp-cluster-for-inf-Beta2try1'

HyperPodManager.list_clusters(region=REGION)
HyperPodManager.set_context(CLUSTER_NAME, region=REGION)

Orchestrator    Cluster Name
--------------  ----------------------------
EKS             hp-cluster-for-inf-Beta2try1
Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig
Successfully set current cluster as: hp-cluster-for-inf-Beta2try1


In [2]:
from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
import yaml
import time

In [3]:
# TLS configuration (S3 URI passed as the certificate output location)
tls_config = TlsConfig(
    tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2',
)

# Model source: an FSx file system plus the model location within it
fsx_storage = FsxStorage(file_system_id='fs-0e6a92495c35a81f2')
model_source_config = ModelSourceConfig(
    model_source_type='fsx',
    model_location="deepseek-1-5b",
    fsx_storage=fsx_storage,
)

# Container environment as name -> value pairs; dict insertion order is
# preserved, so the resulting list keeps this exact order.
env_settings = {
    "HF_MODEL_ID": "/opt/ml/model",
    "SAGEMAKER_PROGRAM": "inference.py",
    "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code",
    "MODEL_CACHE_ROOT": "/opt/ml/model",
    "SAGEMAKER_ENV": "1",
}
environment_variables = [
    EnvironmentVariables(name=env_name, value=env_value)
    for env_name, env_value in env_settings.items()
]

# Resource requests/limits for the inference container (one GPU)
worker_resources = Resources(
    requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"},
    limits={"nvidia.com/gpu": 1},
)

# Worker: container image, model volume mount, invocation port, resources, env
worker = Worker(
    image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0',
    model_volume_mount=ModelVolumeMount(name='model-weights'),
    model_invocation_port=ModelInvocationPort(container_port=8080),
    resources=worker_resources,
    environment_variables=environment_variables,
)

In [4]:
# Use a new model_name for every new deployment.
# Older image version has a bug that causes 502 bad gateway from invoking endpoint
fsx_model_name = 'deepseek15b-fsx-test-pysdk-07-01-1'
fsx_endpoint_name = 'test-endpoint-name-fsx-pysdk-07-01-1'

# Assemble the endpoint definition from the config objects built earlier
fsx_endpoint = HPEndpoint(
    endpoint_name=fsx_endpoint_name,
    instance_type='ml.g5.8xlarge',
    model_name=fsx_model_name,
    tls_config=tls_config,
    model_source_config=model_source_config,
    worker=worker,
)

In [5]:
# Create the endpoint defined above; its status is polled in the following cell.
fsx_endpoint.create()

In [6]:
# Poll the endpoint until it is available, the deployment fails, or we time out.
t = 0          # seconds waited so far
timeout = 600  # 600 seconds timeout
interval = 15  # poll every 15 seconds

while t < timeout:
    # use refresh to fetch latest status
    fsx_endpoint.refresh()

    print('Refreshing instance status...')

    try:
        # deployment status will be available immediately
        deployment_status = fsx_endpoint.status.deploymentStatus.deploymentObjectOverallState
        if deployment_status == 'DeploymentFailed':
            print('Deployment failed!')
            break

        # endpoint status will become available from refresh() at some point
        endpoint_status = fsx_endpoint.status.endpoints.sagemaker.state
        if endpoint_status == 'CreationCompleted':
            print('Endpoint is available!')
            break
    except AttributeError:
        # Part of the status tree is not populated yet (an intermediate
        # attribute is still None), so keep polling. Only AttributeError is
        # swallowed: a bare `except` would also hide genuine errors and
        # KeyboardInterrupt raised inside this block.
        pass

    time.sleep(interval)
    t += interval

# The loop only runs to exhaustion (t >= timeout) when neither break fired.
if t >= timeout:
    print('Endpoint creation timed out!')

# print endpoint in yaml
def print_yaml(endpoint):
    """Print ``endpoint`` as YAML text.

    The endpoint is first converted with ``model_dump(exclude_none=True)``
    and the resulting object is serialised with ``yaml.dump`` and printed.
    """
    endpoint_fields = endpoint.model_dump(exclude_none=True)
    yaml_text = yaml.dump(endpoint_fields)
    print(yaml_text)

Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Endpoint is available!


In [7]:
# list all endpoints and show the first one as YAML
endpoint_list = HPEndpoint.list()
if endpoint_list:
    print_yaml(endpoint_list[0])
else:
    # Avoid an IndexError on endpoint_list[0] when nothing has been deployed
    print('No endpoints found.')

endpointName: test-endpoint-name-fsx-pysdk-07-01-1
instanceType: ml.g5.8xlarge
invocationEndpoint: invocations
metadata:
  name: deepseek15b-fsx-test-pysdk-07-01-1
  namespace: default
modelName: deepseek15b-fsx-test-pysdk-07-01-1
modelSourceConfig:
  fsxStorage:
    fileSystemId: fs-0e6a92495c35a81f2
  modelLocation: deepseek-1-5b
  modelSourceType: fsx
  prefetchEnabled: false
replicas: 1
status:
  conditions:
  - lastTransitionTime: '2025-07-01T19:13:44Z'
    message: Deployment or SageMaker endpoint registration creation for model is in
      progress
    reason: InProgress
    status: 'True'
    type: DeploymentInProgress
  - lastTransitionTime: '2025-07-01T19:18:59Z'
    message: Deployment and SageMaker endpoint registration for model have been created
      successfully
    reason: Success
    status: 'True'
    type: DeploymentComplete
  deploymentStatus:
    deploymentObjectOverallState: DeploymentComplete
    lastUpdated: '2025-07-01T19:18:59Z'
    name: deepseek15b-fsx-test

In [8]:
# Fetch the endpoint by name; this is the same string passed as model_name when
# fsx_endpoint was built, and it appears as metadata.name in the listing output.
endpoint = HPEndpoint.get(name='deepseek15b-fsx-test-pysdk-07-01-1')

In [9]:
# Request payload: a JSON document with the prompt under "inputs"
data = '{"inputs": "What is the capital of Japan?"}'

# Invoke the endpoint; the cell displays the raw bytes of the response body
invoke_response = endpoint.invoke(body=data)
invoke_response.body.read()





b'[{"generated_text":"What is the capital of Japan? What is the capital of the United States? What is the capital of Brazil? What is the capital of the United Kingdom? What is the capital of Germany? What is the capital of Spain? What is the capital of_connecting_\xe5\x8f\xa3\xe4\xb8\xad_only_have_three_quarter_bagels? What is the capital of China?\\n\\n**Pairing the series:  I: Tokyo, II: New York, III: Paris, IV: London, V: Berlin, VI: Madrid, VII: Moscow, VIII"}]'

In [None]:
# Clean up: delete the endpoint that was fetched with HPEndpoint.get above
endpoint.delete()