## Sample for Submarine Experiment SDK

This notebook shows how to use the Submarine Experiment SDK to create, get, list, fetch logs for, and delete Submarine experiments.

In [1]:
from __future__ import print_function
import submarine
from submarine.experiment.models.environment_spec import EnvironmentSpec
from submarine.experiment.models.experiment_spec import ExperimentSpec
from submarine.experiment.models.experiment_task_spec import ExperimentTaskSpec
from submarine.experiment.models.experiment_meta import ExperimentMeta
from submarine.experiment.models.code_spec import CodeSpec

### Create Submarine Client

In [None]:
# The host defaults to "<SUBMARINE_SERVER_DNS_NAME>:<SUBMARINE_SERVER_PORT>".
# If you are not running this in a Jupyter notebook generated by Submarine,
# set the two environment variables "SUBMARINE_SERVER_DNS_NAME"
# (e.g. localhost) and "SUBMARINE_SERVER_PORT" (e.g. 8080) before creating the client.

In [2]:
# Build the experiment client with no arguments, i.e. with its default settings.
# Every later cell in this notebook calls methods on this `submarine_client` object.
submarine_client = submarine.ExperimentClient()

### Define TensorFlow experiment spec

#### Define Submarine spec

The demo creates only one PS (parameter server) and one worker for a TensorFlow experiment that runs the MNIST sample.

In [3]:
# Task-level pieces first: the code source and the per-task resource requests.
# The PS and the worker are configured identically (one replica each).
code_spec = CodeSpec(url="https://github.com/apache/submarine.git", sync_mode="git")
ps_spec = ExperimentTaskSpec(replicas=1, resources="cpu=1,memory=1024M")
worker_spec = ExperimentTaskSpec(replicas=1, resources="cpu=1,memory=1024M")

# Environment for the experiment, identified by its image name.
environment = EnvironmentSpec(image="apache/submarine:tf-dist-mnist-test-1.0")

# Experiment metadata, including the training command and an example env var.
experiment_meta = ExperimentMeta(
    cmd="python /var/tf_dist_mnist/dist_mnist.py --train_steps=100",
    env_vars={"ENV1": "ENV1"},
    framework="Tensorflow",
    name="mnist-dist",
    namespace="default",
)

# Assemble the full spec that the "Create experiment" cell submits.
# The task keys stay in "Ps", "Worker" order.
experiment_spec = ExperimentSpec(
    spec={"Ps": ps_spec, "Worker": worker_spec},
    code=code_spec,
    environment=environment,
    meta=experiment_meta,
)

### Create experiment

In [4]:
# Submit the spec defined above. The returned record is kept in `experiment`
# because the next cell reads its "experimentId" key.
experiment = submarine_client.create_experiment(experiment_spec=experiment_spec)
# Bare expression: displays the returned record as the cell output.
experiment

{'experimentId': 'experiment_1601021036429_0013',
 'name': 'mnist-dist',
 'uid': 'fdee35f9-7877-4f59-8b19-c83fe3635408',
 'status': 'Accepted',
 'acceptedTime': '2020-09-25T16:52:17.000+08:00',
 'createdTime': None,
 'runningTime': None,
 'finishedTime': None,
 'spec': {'meta': {'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'}},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024

### Get the created experiment

In [5]:
# NOTE(review): `id` shadows Python's built-in `id()` for the rest of the kernel
# session. The wait, log and delete cells below all read this exact name, so any
# rename has to be applied to those cells at the same time.
id = experiment["experimentId"]
# Look the experiment up again by its ID and display the result.
submarine_client.get_experiment(id)

{'experimentId': 'experiment_1601021036429_0013',
 'name': 'mnist-dist',
 'uid': 'fdee35f9-7877-4f59-8b19-c83fe3635408',
 'status': 'Created',
 'acceptedTime': '2020-09-25T16:52:17.000+08:00',
 'createdTime': '2020-09-25T16:52:17.000+08:00',
 'runningTime': None,
 'finishedTime': None,
 'spec': {'meta': {'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'}},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'reso

### List all running experiments

In [8]:
# Status value passed to list_experiments. The saved output reports the matching
# experiment's status as 'Running' (capitalised) — presumably the match is
# case-insensitive; TODO confirm against the SDK.
status = "running"
submarine_client.list_experiments(status=status)

[{'experimentId': 'experiment_1601021036429_0013',
  'name': 'mnist-dist',
  'uid': 'fdee35f9-7877-4f59-8b19-c83fe3635408',
  'status': 'Running',
  'acceptedTime': '2020-09-25T16:52:17.000+08:00',
  'createdTime': '2020-09-25T16:52:17.000+08:00',
  'runningTime': '2020-09-25T16:53:19.000+08:00',
  'finishedTime': None,
  'spec': {'meta': {'name': 'mnist-dist',
    'namespace': 'default',
    'framework': 'Tensorflow',
    'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
    'envVars': {'ENV1': 'ENV1'}},
   'environment': {'name': None,
    'dockerImage': None,
    'kernelSpec': None,
    'description': None,
    'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
   'spec': {'Ps': {'replicas': 1,
     'resources': 'cpu=1,memory=1024M',
     'name': None,
     'image': None,
     'cmd': None,
     'envVars': None,
     'resourceMap': {'memory': '1024M', 'cpu': '1'}},
    'Worker': {'replicas': 1,
     'resources': 'cpu=1,memory=1024M',
     'name': None,
     'image

### Wait for the experiment to finish

In [9]:
# Presumably blocks until the experiment identified by `id` has finished (per the
# section heading); the saved output below shows TensorFlow log lines printed
# while waiting — TODO confirm exact behaviour against the SDK.
submarine_client.wait_for_finish(id)

  from ._conv import register_converters as _register_converters
2020-09-25 08:53:11.824375: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2020-09-25 08:53:11.832165: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2020-09-25 08:53:11.832195: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> mnist-dist-worker-0.default.svc:2222}
2020-09-25 08:53:11.878806: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222


### Get specific experiment training log

In [10]:
# Fetch the training log for the experiment; the saved output below is grouped
# per pod ("The logs of Pod mnist-dist-ps-0:", "The logs of Pod mnist-dist-worker-0:").
submarine_client.get_log(id)

The logs of Pod mnist-dist-ps-0:

  from ._conv import register_converters as _register_converters
2020-09-25 08:53:11.824375: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2020-09-25 08:53:11.832165: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2020-09-25 08:53:11.832195: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> mnist-dist-worker-0.default.svc:2222}
2020-09-25 08:53:11.878806: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222
The logs of Pod mnist-dist-worker-0:

  from ._conv import register_converters as _register_converters
2020-09-25 08:53:21.614236: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this Tens

### Delete the experiment

In [11]:
# Delete the experiment by ID; the record displayed below has status 'Deleted'.
submarine_client.delete_experiment(id)

{'experimentId': 'experiment_1601021036429_0013',
 'name': 'mnist-dist',
 'uid': 'fdee35f9-7877-4f59-8b19-c83fe3635408',
 'status': 'Deleted',
 'acceptedTime': '2020-09-25T16:52:17.000+08:00',
 'createdTime': '2020-09-25T16:52:17.000+08:00',
 'runningTime': '2020-09-25T16:53:19.000+08:00',
 'finishedTime': '2020-09-25T16:53:54.000+08:00',
 'spec': {'meta': {'name': 'mnist-dist',
   'namespace': 'default',
   'framework': 'Tensorflow',
   'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',
   'envVars': {'ENV1': 'ENV1'}},
  'environment': {'name': None,
   'dockerImage': None,
   'kernelSpec': None,
   'description': None,
   'image': 'apache/submarine:tf-dist-mnist-test-1.0'},
  'spec': {'Ps': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image': None,
    'cmd': None,
    'envVars': None,
    'resourceMap': {'memory': '1024M', 'cpu': '1'}},
   'Worker': {'replicas': 1,
    'resources': 'cpu=1,memory=1024M',
    'name': None,
    'image':