## HyperPod Training SDK Experience

In [1]:
import os
import sys
import warnings

# Silence all warnings so the demo output stays readable.
warnings.filterwarnings("ignore")

# Local SDK source directory to put on the module search path.
# Override it by setting the HYPERPOD_SDK_SRC environment variable; when the
# variable is unset or empty, the path this notebook was recorded with is used.
SDK_SRC_PATH = os.environ.get("HYPERPOD_SDK_SRC") or (
    '/Users/pintaoz/workspace/private-sagemaker-hyperpod-cli-staging/sagemaker-hyperpod/src/sagemaker'
)

# Insert the directory at the front of sys.path, but only if it is not already
# there, so re-running this cell does not accumulate duplicate entries.
if SDK_SRC_PATH not in sys.path:
    sys.path.insert(0, SDK_SRC_PATH)
sys.path

['/Users/pintaoz/workspace/private-sagemaker-hyperpod-cli-staging/sagemaker-hyperpod/src/sagemaker',
 '/Users/pintaoz/.pyenv/versions/3.10.14/lib/python310.zip',
 '/Users/pintaoz/.pyenv/versions/3.10.14/lib/python3.10',
 '/Users/pintaoz/.pyenv/versions/3.10.14/lib/python3.10/lib-dynload',
 '',
 '/Users/pintaoz/.pyenv/versions/3.10.14/envs/py3.10/lib/python3.10/site-packages']

### Create a HyperPodPytorchJob with a full spec

In [1]:
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config import (
    _HyperPodPytorchJob,
    Container,
    ReplicaSpec,
    Resources,
    RunPolicy,
    Spec,
    Template,
)
from sagemaker.hyperpod.inference.config.common import Metadata

# Resource requests and limits for the container (zero GPUs in this demo).
gpu_resources = Resources(
    requests={"nvidia.com/gpu": "0"},
    limits={"nvidia.com/gpu": "0"},
)

# The single container in the pod template; `command` is not set here.
training_container = Container(
    name="container-name",
    image="448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist",
    image_pull_policy="Always",
    resources=gpu_resources,
)

# One replica group named "pod", built from a template holding that container.
pod_template = Template(spec=Spec(containers=[training_container]))
pod_replica = ReplicaSpec(name="pod", template=pod_template)

# Assemble the full job spec.
spec = _HyperPodPytorchJob(
    nproc_per_node="1",
    replica_specs=[pod_replica],
    run_policy=RunPolicy(clean_pod_policy="None"),
)

# Pair the spec with its metadata and create the job.
job_metadata = Metadata(name="demo")
pytorch_job = HyperPodPytorchJob(metadata=job_metadata, spec=spec)
pytorch_job.create()


Deploying HyperPodPytorchJob with config:
apiVersion: sagemaker.amazonaws.com/v1
kind: HyperPodPyTorchJob
metadata:
  labels: null
  name: demo
  namespace: default
spec:
  nprocPerNode: '1'
  replicaSpecs:
  - name: pod
    replicas: 1
    spares: 0
    template:
      metadata: null
      spec:
        activeDeadlineSeconds: null
        affinity: null
        automountServiceAccountToken: null
        containers:
        - args: null
          command: null
          env: null
          envFrom: null
          image: 448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist
          imagePullPolicy: Always
          lifecycle: null
          livenessProbe: null
          name: container-name
          ports: null
          readinessProbe: null
          resizePolicy: null
          resources:
            claims: null
            limits:
              nvidia.com/gpu: '0'
            requests:
              nvidia.com/gpu: '0'
          restartPolicy: null
          securityContext: n

Exception: Resource 'demo' already exists in 'default'.

### Get the status of created jobs

In [None]:
import yaml

# NOTE(review): yaml.dump receives SDK objects directly, so the output carries
# `!!python/object` tags; dumping plain dicts would likely read better.
print("List all jobs:")
all_jobs = HyperPodPytorchJob.list()
print(yaml.dump(all_jobs))

# Call refresh() on the job object before reading its `status`.
print("Refresh job demo and check the status:")
pytorch_job.refresh()
job_status = pytorch_job.status
print(yaml.dump(job_status))


List all jobs:
- !!python/object:sagemaker.hyperpod.training.hyperpod_pytorch_job.HyperPodPytorchJob
  __dict__:
    apiVersion: sagemaker.amazonaws.com/v1
    kind: HyperPodPyTorchJob
    metadata: !!python/object:sagemaker.hyperpod.inference.config.common.Metadata
      __dict__:
        labels: null
        name: create-from-spec-demo-5
        namespace: default
      __pydantic_extra__: null
      __pydantic_fields_set__: !!set
        name: null
        namespace: null
      __pydantic_private__: null
    spec: !!python/object:sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config._HyperPodPytorchJob
      __dict__:
        nprocPerNode: '1'
        replicaSpecs:
        - !!python/object:sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config.ReplicaSpec
          __dict__:
            name: pod
            replicas: 1
            spares: 0
            template: !!python/object:sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config.Template
              

### Get model training logs from the pod

In [3]:
# Show the pod names that list_pods() reports for this job.
print("List all pods created for this job:")
pod_names = pytorch_job.list_pods()
print(pod_names)

# Fetch and print the logs of one pod, addressed by name.
# NOTE(review): the pod name is hard-coded; it appears to be derived from the
# job name "demo" and the replica name "pod" — confirm if either changes.
print("Check the logs from pod0:")
pod0_logs = pytorch_job.get_logs_from_pod("demo-pod-0")
print(pod0_logs)

List all pods created for this job:
['demo-pod-0', 'demo-podmanagers-0']
Check the logs from pod0:
2025-06-29T21:45:26.710683365Z [HyperPodElasticAgent] 2025-06-29 21:45:26,710 [INFO] [rank0-restart0] /opt/conda/lib/python3.11/site-packages/amzn_hyper_pod_elastic_agent/run.py:68: Agent Version: 1.1.20250621203754
2025-06-29T21:45:26.711332626Z INFO:Log Agent:Log agent is ready, log agent logs will be saved to /tmp/log_agent_8g5qj5q8.log
2025-06-29T21:45:26.711777464Z [HyperPodElasticAgent] 2025-06-29 21:45:26,711 [INFO] [rank0-restart0] /opt/conda/lib/python3.11/site-packages/amzn_hyper_pod_elastic_agent/ipc/socket.py:161: Server started on /tmp/hyperpod_elastic_agent_1.sock
2025-06-29T21:45:26.711788125Z [HyperPodElasticAgent] 2025-06-29 21:45:26,711 [INFO] [rank0-restart0] /opt/conda/lib/python3.11/site-packages/amzn_hyper_pod_elastic_agent/ipc/server.py:66: Started CheckpointDiscoverySocketServer over /tmp/hyperpod_elastic_agent_1.sock
2025-06-29T21:45:26.711925367Z [HyperPodElastic

### Delete the job

In [None]:
# Delete the job that `pytorch_job` refers to; on success the SDK prints a
# confirmation message (shown in the cell output).
pytorch_job.delete()

Successful deleted HyperpodPytorchJob!
