In [1]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container

from kubeflow.training import constants
from kubeflow.training.utils import utils
from kubeflow.training import V1ReplicaSpec
from kubeflow.training import KubeflowOrgV1TFJob
from kubeflow.training import KubeflowOrgV1TFJobSpec
from kubeflow.training import V1RunPolicy
from kubeflow.training import TFJobClient

In [2]:
namespace = utils.get_default_target_namespace()

In [3]:
job_name = 'mnist-test-01'

tfjob_client = TFJobClient(config_file='config')

In [4]:
container = V1Container(
    name="tfjob",
    image="duxucloud/gcr.io_kubeflow-ci_tf-mnist-with-summaries:1.0",
    command=[
        "python",
        "/var/tf_mnist/mnist_with_summaries.py",
        "--log_dir=/train/logs", "--learning_rate=0.01",
        "--batch_size=150"
        ]
)

worker = V1ReplicaSpec(
    replicas=2,
    restart_policy="Never",
    template=V1PodTemplateSpec(
        spec=V1PodSpec(
            containers=[container]
        )
    )
)

chief = V1ReplicaSpec(
    replicas=1,
    restart_policy="Never",
    template=V1PodTemplateSpec(
        spec=V1PodSpec(
            containers=[container]
        )
    )
)

ps = V1ReplicaSpec(
    replicas=1,
    restart_policy="Never",
    template=V1PodTemplateSpec(
        spec=V1PodSpec(
            containers=[container]
        )
    )
)

tfjob = KubeflowOrgV1TFJob(
    api_version="kubeflow.org/v1",
    kind="TFJob",
    metadata=V1ObjectMeta(name=job_name,namespace=namespace),
    spec=KubeflowOrgV1TFJobSpec(
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        tf_replica_specs={"Worker": worker,
                          "Chief": chief,
                          "PS": ps}
    )
)

In [5]:
tfjob_client = TFJobClient(config_file='config')
tfjob_client.create(tfjob, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2023-04-07T08:04:06Z',
  'generation': 1,
  'managedFields': [{'apiVersion': 'kubeflow.org/v1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {},
      'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}},
      'f:tfReplicaSpecs': {'.': {},
       'f:Chief': {'.': {},
        'f:replicas': {},
        'f:restartPolicy': {},
        'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}},
       'f:PS': {'.': {},
        'f:replicas': {},
        'f:restartPolicy': {},
        'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}},
       'f:Worker': {'.': {},
        'f:replicas': {},
        'f:restartPolicy': {},
        'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}}}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2023-04-07T08:04:06Z'}],
  'name': 'mnist-test-01',
  'namespace': 'default',
  'resourceVersion': '

In [6]:
tfjob_client.get_job_status('mnist', namespace=namespace)


'Succeeded'

In [7]:
tfjob_client.get_logs('mnist', namespace=namespace)


The logs of Pod mnist-chief-0:
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use urllib or similar directly.
Instructions for updating:
Please use tf.data to implement this functionality.
Instructions for updating:
Please use tf.data to implement this functionality.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
2023-03-30 07:57:04.564094: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz
Succe

In [8]:
tfjob_client.delete(job_name, namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'mnist-test-01',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': '207bd0ef-1a4f-4e55-b450-5c52384339da'}}