## Sample for Submarine Experiment SDK

This notebook shows how to use the Submarine Experiment SDK to create, get, list, fetch logs for, and delete Submarine experiments.

In [1]:
from __future__ import print_function
import submarine
from submarine.client.models.environment_spec import EnvironmentSpec
from submarine.client.models.experiment_spec import ExperimentSpec
from submarine.client.models.experiment_task_spec import ExperimentTaskSpec
from submarine.client.models.experiment_meta import ExperimentMeta
from submarine.client.models.code_spec import CodeSpec

### Create Submarine Client

In [2]:
# The host defaults to "<SUBMARINE_SERVER_DNS_NAME>:<SUBMARINE_SERVER_PORT>".
# If you are not using a Jupyter notebook generated by Submarine,
# please set the two environment variables "SUBMARINE_SERVER_DNS_NAME"
# (e.g. localhost) and "SUBMARINE_SERVER_PORT" (e.g. 8080).

In [3]:
# Create the client that every later cell uses to talk to the Submarine server.
# No arguments are passed, so the client relies on its default host settings.
submarine_client = submarine.ExperimentClient()

### Define TensorFlow experiment spec
Define Submarine spec

This demo creates a TensorFlow experiment with only one PS and one worker to run the MNIST sample.

In [4]:
# Where the experiment code is synced from (git checkout of the Submarine repo).
code_spec = CodeSpec(url="https://github.com/apache/submarine.git", sync_mode="git")

# The PS task and the worker task request the same resources, one replica each.
task_resources = "cpu=1,memory=1024M"
ps_spec = ExperimentTaskSpec(replicas=1, resources=task_resources)
worker_spec = ExperimentTaskSpec(replicas=1, resources=task_resources)
task_specs = {"Ps": ps_spec, "Worker": worker_spec}

# Container image the experiment runs in.
environment = EnvironmentSpec(image="apache/submarine:tf-dist-mnist-test-1.0")

# Experiment-level metadata: name, namespace, framework, the command to run,
# and the environment variables passed along with it.
experiment_meta = ExperimentMeta(
    name="mnist-dist",
    namespace="default",
    framework="Tensorflow",
    cmd="python /var/tf_dist_mnist/dist_mnist.py --train_steps=100",
    env_vars={"ENV1": "ENV1"},
)

# Bundle everything into the spec that is submitted in the next cell.
experiment_spec = ExperimentSpec(
    meta=experiment_meta,
    environment=environment,
    code=code_spec,
    spec=task_specs,
)

### Create experiment

In [5]:
# Submit the spec built above. The result is kept in `experiment` because a
# later cell reads the "experimentId" key from it.
experiment = submarine_client.create_experiment(experiment_spec=experiment_spec)
# Bare last expression so the notebook displays the returned record.
experiment

{'experimentId': 'experiment-1640110858229-0006',
 'uid': '0306270d-fd79-4a42-bd50-943ad7f9a3a3',
 'status': 'Accepted',
 'acceptedTime': '2021-12-22T02:33:56.000+08:00',
 'createdTime': None,
 'runningTime': None,
 'finishedTime': None,
 'spec': {'meta': {'experimentId': 'experiment-1640110858229-0006',
   'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'},
   'tags': []},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars

### Get the created experiment

In [6]:
# Read the experiment ID from the create_experiment result, then fetch the
# experiment again by that ID.
# NOTE(review): `id` shadows Python's built-in id(). The name is kept because
# later cells pass `id` to wait_for_finish, get_log and delete_experiment;
# renaming it (e.g. to `experiment_id`) must be done in all of those cells at once.
id = experiment["experimentId"]
submarine_client.get_experiment(id)

{'experimentId': 'experiment-1640110858229-0006',
 'uid': '0306270d-fd79-4a42-bd50-943ad7f9a3a3',
 'status': 'Created',
 'acceptedTime': '2021-12-22T02:33:56.000+08:00',
 'createdTime': '2021-12-22T02:33:56.000+08:00',
 'runningTime': None,
 'finishedTime': None,
 'spec': {'meta': {'experimentId': 'experiment-1640110858229-0006',
   'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'},
   'tags': []},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
   

### List all running experiments

In [7]:
# List only the experiments whose status matches the given filter value.
# In the recorded run this returned an empty list.
status = "running"
submarine_client.list_experiments(status=status)

[]

### Wait for the experiment to finish

In [8]:
# Wait for the experiment identified by `id` to finish.
# NOTE(review): the recorded output shows TensorFlow log lines being printed
# during the wait; presumably the call streams pod logs until completion -- confirm.
submarine_client.wait_for_finish(id)

  from ._conv import register_converters as _register_converters
2021-12-21 18:34:09.528066: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2021-12-21 18:34:09.529126: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2021-12-21 18:34:09.529164: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> experiment-1640110858229-0006-worker-0.default.svc:2222}
2021-12-21 18:34:09.529732: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222


### Get specific experiment training log

In [9]:
# Fetch the training logs of the experiment identified by `id`.
# In the recorded run the logs are printed pod by pod (ps-0, then worker-0).
submarine_client.get_log(id)

The logs of Pod experiment-1640110858229-0006-ps-0:

  from ._conv import register_converters as _register_converters
2021-12-21 18:34:09.528066: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2021-12-21 18:34:09.529126: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2021-12-21 18:34:09.529164: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> experiment-1640110858229-0006-worker-0.default.svc:2222}
2021-12-21 18:34:09.529732: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222
The logs of Pod experiment-1640110858229-0006-worker-0:

  from ._conv import register_converters as _register_converters
2021-12-21 18:34:08.093849: I tensorflow/core/platform/cpu_feature_gu

### Delete the experiment

In [10]:
# Delete the experiment identified by `id`.
# In the recorded run the call returned the experiment record with status 'Deleted'.
submarine_client.delete_experiment(id)

{'experimentId': 'experiment-1640110858229-0006',
 'uid': '0306270d-fd79-4a42-bd50-943ad7f9a3a3',
 'status': 'Deleted',
 'acceptedTime': None,
 'createdTime': None,
 'runningTime': None,
 'finishedTime': None,
 'spec': {'meta': {'experimentId': 'experiment-1640110858229-0006',
   'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'},
   'tags': []},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': 