In [1]:
from sagemaker.hyperpod.hyperpod_manager import HyperPodManager

# Region and cluster targeted by the two calls below.
AWS_REGION = 'us-east-2'
CLUSTER_NAME = 'hp-cluster-for-inf-Beta2try1'

# List the HyperPod clusters in the region, then select one as the current
# cluster (the captured output shows the kubeconfig context being updated).
HyperPodManager.list_clusters(region=AWS_REGION)
HyperPodManager.set_context(CLUSTER_NAME, region=AWS_REGION)

Orchestrator    Cluster Name
--------------  ----------------------------
EKS             hp-cluster-for-inf-Beta2try1
Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig
Successfully set current cluster as: hp-cluster-for-inf-Beta2try1


### Create deployment from spec object

In [1]:
from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, S3Storage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker
from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint

In [3]:
# TLS settings: the S3 URI supplied as the certificate output location.
tls_config = TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')

# Model source: an S3 bucket in us-east-2.
# NOTE(review): model_location is presumably the key prefix inside the
# bucket -- confirm against the SDK documentation.
model_bucket = S3Storage(
    bucket_name='test-model-s3-zhaoqi',
    region='us-east-2',
)
model_source_config = ModelSourceConfig(
    model_source_type='s3',
    model_location="deepseek15b",
    s3_storage=model_bucket,
)

# Environment variables handed to the worker, kept as an ordered
# name -> value mapping and expanded into EnvironmentVariables objects.
container_env = {
    "HF_MODEL_ID": "/opt/ml/model",
    "SAGEMAKER_PROGRAM": "inference.py",
    "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code",
    "MODEL_CACHE_ROOT": "/opt/ml/model",
    "SAGEMAKER_ENV": "1",
}
environment_variables = [
    EnvironmentVariables(name=env_name, value=env_value)
    for env_name, env_value in container_env.items()
]

# Worker: container image, model volume mount, invocation port and the
# CPU / GPU / memory requests and limits.
worker_resources = Resources(
    requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"},
    limits={"nvidia.com/gpu": 1},
)
worker = Worker(
    image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0',
    model_volume_mount=ModelVolumeMount(name='model-weights'),
    model_invocation_port=ModelInvocationPort(container_port=8080),
    resources=worker_resources,
    environment_variables=environment_variables,
)

In [4]:
# Assemble the endpoint spec from the TLS, model-source and worker objects
# built in the previous cell. This cell produces no output; the deployment
# itself is started by the later s3_endpoint.create() call.
s3_endpoint = HPEndpoint(
    endpoint_name='test-endpoint-name-zhaoqi-06-28-1',
    instance_type='ml.g5.8xlarge',
    model_name='deepseek15b-test-zhaoqi-06-28-1',
    tls_config=tls_config,
    model_source_config=model_source_config,
    worker=worker,
)

In [5]:
# Start the deployment of the model and its endpoint. Per the message printed
# below, the process may take a few minutes to finish.
s3_endpoint.create()


Deploying model and its endpoint... The process may take a few minutes.


In [11]:
# Refresh the local object so the config/status printed in the next cell is
# current. NOTE(review): presumably re-reads the resource from the cluster --
# confirm against the SDK documentation.
s3_endpoint.refresh()

In [12]:
# Show the refreshed endpoint as YAML, leaving out fields that are None.
import yaml

endpoint_fields = s3_endpoint.model_dump(exclude_none=True)
print(yaml.dump(endpoint_fields))

endpointName: test-endpoint-name-zhaoqi-06-28-1
instanceType: ml.g5.8xlarge
invocationEndpoint: invocations
modelName: deepseek15b-test-zhaoqi-06-28-1
modelSourceConfig:
  modelLocation: deepseek15b
  modelSourceType: s3
  prefetchEnabled: false
  s3Storage:
    bucketName: test-model-s3-zhaoqi
    region: us-east-2
namespace: default
replicas: 1
status:
  conditions:
  - lastTransitionTime: '2025-06-29T00:51:21Z'
    message: Deployment or SageMaker endpoint registration creation for model is in
      progress
    reason: InProgress
    status: 'True'
    type: DeploymentInProgress
  - lastTransitionTime: '2025-06-29T00:56:36Z'
    message: Deployment and SageMaker endpoint registration for model have been created
      successfully
    reason: Success
    status: 'True'
    type: DeploymentComplete
  deploymentStatus:
    deploymentObjectOverallState: DeploymentComplete
    lastUpdated: '2025-06-29T00:56:37Z'
    name: deepseek15b-test-zhaoqi-06-28-1
    reason: NativeDeploymentObjec

### List all endpoints

In [14]:
# Fetch the existing endpoints as a list of HPEndpoint objects and display
# them (the bare name on the last line renders the list's repr).
endpoints = HPEndpoint.list()
endpoints

[HPEndpoint(InitialReplicaCount=None, autoScalingSpec=None, endpointName='test-endpoint-name-fsx-zhaoqi-pysdk', instanceType='ml.g5.8xlarge', invocationEndpoint='invocations', metrics=None, modelName='deepseek15b-fsx-test-zhaoqi-pysdk', modelSourceConfig=ModelSourceConfig(fsxStorage=FsxStorage(dnsName=None, fileSystemId='fs-0e6a92495c35a81f2', mountName=None), modelLocation='deepseek-1-5b', modelSourceType='fsx', prefetchEnabled=False, s3Storage=None), modelVersion=None, replicas=1, tags=None, tlsConfig=TlsConfig(tlsCertificateOutputS3Uri='s3://tls-bucket-inf1-beta2'), worker=Worker(environmentVariables=[EnvironmentVariables(name='HF_MODEL_ID', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_PROGRAM', value='inference.py', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_SUBMIT_DIRECTORY', value='/opt/ml/model/code', valueFrom=None), EnvironmentVariables(name='MODEL_CACHE_ROOT', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAK

### Invoke endpoint

In [4]:
# Look up an existing endpoint object by name.
# NOTE(review): the name passed here matches the model_name used in the create
# cell ('deepseek15b-test-zhaoqi-06-28-1'), not its endpoint_name -- confirm
# which identifier HPEndpoint.get expects.
endpoint = HPEndpoint.get(name='deepseek15b-test-zhaoqi-06-28-1')

# another way to get endpoint object
# endpoint = HPEndpoint.list()[0]

In [5]:
# Request body: a JSON document carrying the prompt under "inputs".
data = '{"inputs": "What is the capital of Japan?"}'

# Send the request, then read the response body; the last expression displays
# the raw bytes returned by the endpoint.
response = endpoint.invoke(
    body=data,
    content_type='application/json',
)
response.body.read()

b'[{"generated_text":"What is the capital of Japan? How about its population? What else? Answer in letters.\\nOkay, so I need to figure out the capital of Japan and its population, and then provide some additional information. The user has also specified the format, but it\'s not entirely clear from the start. Let me parse that again.\\n\\nThe user wrote: \\"What is the capital of Japan? How about its population? What else? Answer in letters.\\"\\n\\nHmm, that\'s a bit vague. The \\"Answer in letters\\" suggests that the"}]'