## Inference Operator PySDK E2E Experience (JumpStart model)

<b>Prerequisite:</b> Data scientists should list clusters and set cluster context

In [1]:
from sagemaker.hyperpod.hyperpod_manager import HyperPodManager

In [2]:
# List the HyperPod clusters in us-east-2; the resulting table shows each
# cluster's orchestrator and name.
HyperPodManager.list_clusters(region='us-east-2')

Orchestrator    Cluster Name
--------------  ----------------------------
EKS             hp-cluster-for-inf-Beta2try1


In [3]:
# Select the HyperPod cluster that the rest of this notebook works against.
# The name must be one of the cluster names reported by list_clusters().
HyperPodManager.set_context('hp-cluster-for-inf-Beta2try1', region='us-east-2')

Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig
Successfully set current cluster as: hp-cluster-for-inf-Beta2try1


### Create JumpStart model endpoint

#### Create from spec object

In [8]:
from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server,SageMakerEndpoint, TlsConfig, EnvironmentVariables
from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
import yaml
import time

<b>Note:</b> The config classes imported above (such as `Model`, `Server`, and `SageMakerEndpoint`) are auto-generated by a script, based on the [Inference CRD file](https://code.amazon.com/packages/AWSCrescendoInferenceOperator/blobs/mainline/--/dist/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml).

In [9]:
# Build the individual config objects that make up the endpoint spec.
model = Model(model_id='deepseek-llm-r1-distill-qwen-1-5b', model_version='2.0.4')
server = Server(instance_type='ml.g5.8xlarge')
endpoint_name = SageMakerEndpoint(name='deepsek7bsme-testing-jumpstart-7-1')
tls_config = TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')

# Assemble the configs into a single JumpStart endpoint spec. The spec is
# deployed separately by calling js_endpoint.create().
js_endpoint = HPJumpStartEndpoint(
    model=model,
    server=server,
    sage_maker_endpoint=endpoint_name,
    tls_config=tls_config,
)

In [None]:
# Deploy the endpoint described by the js_endpoint spec.
# NOTE(review): presumably this returns before the deployment has finished,
# which is why the next cell polls js_endpoint.status -- confirm.
js_endpoint.create()

In [8]:
# Poll the endpoint until it becomes available, the deployment fails,
# or the timeout budget is used up.
t = 0           # seconds spent sleeping so far
timeout = 600   # stop polling after 600 seconds of waiting
interval = 15   # seconds to sleep between polls

while t < timeout:
    # refresh() fetches the latest status into js_endpoint
    js_endpoint.refresh()

    try:
        # deployment status is available immediately
        deployment_status = js_endpoint.status.deploymentStatus.deploymentObjectOverallState
        if deployment_status == 'DeploymentFailed':
            print('Deployment failed!')
            break

        # endpoint status only becomes available from refresh() at some point
        endpoint_status = js_endpoint.status.endpoints.sagemaker.state
        if endpoint_status == 'CreationCompleted':
            print('Endpoint is available!')
            break
    except AttributeError:
        # Some part of the status tree is not populated yet (presumably still
        # None), so the attribute lookup failed; keep polling. Only
        # AttributeError is caught so that Ctrl-C and genuine errors still
        # surface instead of being silently swallowed.
        pass

    time.sleep(interval)
    t += interval
    print('Refreshing instance status...')
else:
    # while/else: this branch runs only when the loop condition became false,
    # i.e. neither `break` above fired before the timeout was reached.
    print('Endpoint creation timed out!')

Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Refreshing instance status...
Endpoint is available!


In [9]:
# Show the SageMaker endpoint state held on js_endpoint (as of the last refresh()).
js_endpoint.status.endpoints.sagemaker.state

'CreationCompleted'

In [10]:
def print_yaml(endpoint):
    """Print `endpoint` as a YAML document.

    The endpoint is first converted with ``model_dump(exclude_none=True)``,
    so the dump is asked to leave out None-valued fields, and the result is
    serialized with ``yaml.dump`` and written to stdout.
    """
    endpoint_dict = endpoint.model_dump(exclude_none=True)
    yaml_text = yaml.dump(endpoint_dict)
    print(yaml_text)

In [11]:
# Fetch the JumpStart endpoints and print the first one as YAML.
# The output is similar to `kubectl get jumpstartmodels`.
# NOTE(review): indexing [0] assumes at least one endpoint exists -- an empty
# result would presumably raise IndexError here.
endpoint_list = HPJumpStartEndpoint.list()
print_yaml(endpoint_list[0])

autoScalingSpec:
  cloudWatchTrigger:
    metricCollectionPeriod: 300
    metricCollectionStartTime: 300
    metricStat: Average
    metricType: Average
    minValue: 0.0
    useCachedMetrics: true
  cooldownPeriod: 300
  initialCooldownPeriod: 300
  maxReplicaCount: 5
  minReplicaCount: 1
  pollingInterval: 30
  prometheusTrigger:
    metricType: Average
    useCachedMetrics: true
  scaleDownStabilizationTime: 300
  scaleUpStabilizationTime: 0
maxDeployTimeInSeconds: 3600
metadata:
  name: deepseek-llm-r1-distill-qwen-1-5b
  namespace: default
model:
  acceptEula: false
  modelHubName: SageMakerPublicHub
  modelId: deepseek-llm-r1-distill-qwen-1-5b
  modelVersion: 2.0.4
replicas: 1
sageMakerEndpoint:
  name: deepsek7bsme-testing-jumpstart-7-1
server:
  instanceType: ml.g5.8xlarge
status:
  conditions:
  - lastTransitionTime: '2025-07-01T18:30:12Z'
    message: Deployment, ALB Creation or SageMaker endpoint registration creation
      for model is in progress
    reason: InProgress
   

In [12]:
# Look up a single JumpStart endpoint by name and print it as YAML.
# The output is similar to `kubectl describe jumpstartmodel`.
# `endpoint` is reused by the invoke and delete cells further down.
endpoint = HPJumpStartEndpoint.get(name='deepseek-llm-r1-distill-qwen-1-5b')
print_yaml(endpoint)

autoScalingSpec:
  cloudWatchTrigger:
    metricCollectionPeriod: 300
    metricCollectionStartTime: 300
    metricStat: Average
    metricType: Average
    minValue: 0.0
    useCachedMetrics: true
  cooldownPeriod: 300
  initialCooldownPeriod: 300
  maxReplicaCount: 5
  minReplicaCount: 1
  pollingInterval: 30
  prometheusTrigger:
    metricType: Average
    useCachedMetrics: true
  scaleDownStabilizationTime: 300
  scaleUpStabilizationTime: 0
maxDeployTimeInSeconds: 3600
metadata:
  name: deepseek-llm-r1-distill-qwen-1-5b
  namespace: default
model:
  acceptEula: false
  modelHubName: SageMakerPublicHub
  modelId: deepseek-llm-r1-distill-qwen-1-5b
  modelVersion: 2.0.4
replicas: 1
sageMakerEndpoint:
  name: deepsek7bsme-testing-jumpstart-7-1
server:
  instanceType: ml.g5.8xlarge
status:
  conditions:
  - lastTransitionTime: '2025-07-01T18:30:12Z'
    message: Deployment, ALB Creation or SageMaker endpoint registration creation
      for model is in progress
    reason: InProgress
   

### Invoke endpoint

In [13]:
# Send a JSON prompt to the endpoint and display the raw response body.
data = '{"inputs":"What is the capital of USA?"}'

response = endpoint.invoke(body=data)
response.body.read()





b'{"generated_text": " What is the capital of France? What is the capital of Japan? What is the capital of China? What is the capital of Germany? What is"}'

In [14]:
# Fetch the operator logs, requesting the last hour (since_hours=1), and print them.
operator_logs = js_endpoint.get_operator_logs(since_hours=1)
print(operator_logs)

# To fetch the logs of one specific pod instead:
# js_endpoint.get_logs(pod='pod-name')

2025-07-01T17:54:29.409929402Z 2025-07-01T17:54:29Z	INFO	JumpStartModelReconciler	Reconciling JumpStartModelReconciler CR	{"request": {"name":"deepseek-llm-r1-distill-qwen-1-5b","namespace":"default"}, "Region": "us-east-2", "EXECUTION_ROLE_ARN": "arn:aws:iam::637423555983:role/EKSClusterForInf-Beta2try1-cluster-role"}
2025-07-01T17:54:29.424192970Z 2025-07-01T17:54:29Z	INFO	SageMakerEndpointRegistrationReconciler	Starting reconciliation	{"request": {"name":"deepsek7bsme-testing-zhaoqi-0630-jumpstart","namespace":"default"}}
2025-07-01T17:54:29.424223861Z 2025-07-01T17:54:29Z	INFO	SageMakerEndpointRegistrationReconciler	Resource found in the status	{"status": {"observedGeneration":1,"endpoint":{"arn":"arn:aws:sagemaker:us-east-2:637423555983:endpoint/deepsek7bsme-testing-zhaoqi-0630-jumpstart","configArn":"","modelArn":"","lastModifiedTime":"2025-06-30T21:39:20Z"},"loadBalancer":{"hostName":"internal-k8s-default-albdeeps-58143cf402-2030757287.us-east-2.elb.amazonaws.com"},"state":"Crea

In [15]:
# Clean up: delete the endpoint that was looked up earlier with
# HPJumpStartEndpoint.get(...).
endpoint.delete()