# Custom Training Job with Custom Containers

In [6]:
import os
import pprint
import sys
import time

from google.cloud import aiplatform

## Set environment constants

In [7]:
# Google Cloud project: passed to aiplatform.init and used to build the training image URI.
PROJECT_ID = 'jk-mlops-dev'
# Region passed to aiplatform.init as the location for the job.
REGION = 'us-central1'
# Cloud Storage bucket passed to aiplatform.init as the staging bucket.
STAGING_BUCKET = 'gs://jk-vertex-workshop-bucket'

## Prepare and test a training container

### Create a Dockerfile

In [8]:
# Image the training container is built FROM.
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-5'
# Versions pinned in the `pip install` step of the Dockerfile below.
MODEL_GARDEN_VERSION = '2.5.0'
TF_TEXT = '2.5.0'
# Registry path the built image is tagged with and pushed to (no explicit tag).
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/model_garden'

# Dockerfile template: installs the pinned packages on top of BASE_IMAGE and
# copies the local `nlp-trainer` directory to `/trainer` inside the image.
dockerfile = f'''
FROM {BASE_IMAGE}

RUN pip install tf-models-official=={MODEL_GARDEN_VERSION} tensorflow-text=={TF_TEXT}

WORKDIR /

# Copies the trainer code to the docker image.
COPY nlp-trainer /trainer

ENTRYPOINT ["python"]
CMD ["-c", "print('Hello')"]
'''

# Write the rendered template to ./Dockerfile so `docker build .` can pick it up.
with open('Dockerfile', 'w') as dockerfile_handle:
    dockerfile_handle.write(dockerfile)

### Build a container image

In [9]:
# Build the image from the Dockerfile in the current directory and tag it as TRAIN_IMAGE.
! docker build -t {TRAIN_IMAGE} .

Sending build context to Docker daemon  353.8kB
Step 1/6 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> 950969e5619c
Step 2/6 : RUN pip install tf-models-official==2.5.0 tensorflow-text==2.5.0
 ---> Using cache
 ---> 8b4eaa89170c
Step 3/6 : WORKDIR /
 ---> Using cache
 ---> 6ee5de13c4a8
Step 4/6 : COPY nlp-trainer /trainer
 ---> 5781c222eded
Step 5/6 : ENTRYPOINT ["python"]
 ---> Running in 531968473baa
Removing intermediate container 531968473baa
 ---> 12d7ec3c8fb7
Step 6/6 : CMD ["-c", "print('Hello')"]
 ---> Running in e4bfca7d27ac
Removing intermediate container e4bfca7d27ac
 ---> ba0eda1c5e0d
Successfully built ba0eda1c5e0d
Successfully tagged gcr.io/jk-mlops-dev/model_garden:latest


### Push the container to Container Registry

In [10]:
# Push the locally built image to the registry path held in TRAIN_IMAGE.
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/model_garden]

[1Bf56f092a: Preparing 
[1B12432222: Preparing 
[1B464d3f17: Preparing 
[1Bdaea14d2: Preparing 
[1Bb28de254: Preparing 
[1B52e30556: Preparing 
[1Bfc085027: Preparing 
[1B7d90a58d: Preparing 
[1B285b3362: Preparing 
[1B0730cb59: Preparing 
[1B18de1f93: Preparing 
[1Bd1dfb5d0: Preparing 
[1B686f5924: Preparing 
[1B5de2196f: Preparing 
[1B383a0e80: Preparing 
[1Beaf882b2: Preparing 
[1B2519572d: Preparing 
[1Bfbfba824: Preparing 
[1Ba8806df6: Preparing 
[1B2a1c8291: Preparing 
[1Bb49af22b: Preparing 
[1Bb363f69f: Preparing 
[1B0a9a6a11: Preparing 
[1B7e8b38e6: Preparing 
[1B8f196cf4: Preparing 
[1B01dbc7de: Preparing 
[1B31d2d72b: Preparing 
[1Ba966f459: Preparing 
[1Bb9e63cdf: Preparing 
[1B49f5bf51: Preparing 
[1Baa2fa9fe: Preparing 
[23B730cb59: Waiting g 
[1Bdd81f9fa: Preparing 
[33B2432222: Pushed   210.8MB/208.2MBg-platform-release/tf2-gpu.2-5 [33A[2K[31A

## Submit Vertex Training jobs

### Define helper functions

In [11]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED'):
    """Build the worker pool specification list for a custom training job.

    The first entry is always a single-replica chief pool. When
    `replica_count` is greater than 1, a second pool holding the remaining
    `replica_count - 1` replicas is appended. Both pools use the same machine
    and container settings.

    Every pool gets its own freshly built `machine_spec` and `container_spec`
    dicts, and list-valued `args` / `cmd` are copied, so editing one pool
    (or the caller's original lists) afterwards never leaks into another pool.

    Args:
        image_uri: URI of the training container image.
        args: Arguments passed to the container (stored under 'args').
        cmd: Command run in the container (stored under 'command').
        replica_count: Total number of replicas, chief included.
        machine_type: Machine type used by every replica.
        accelerator_count: Accelerators per replica. The accelerator keys are
            only added to the machine spec when this is greater than 0.
        accelerator_type: Accelerator type, used only when
            `accelerator_count` is greater than 0.

    Returns:
        A list of one or two dicts, each with the keys 'replica_count',
        'machine_spec' and 'container_spec'.
    """

    def _copy_if_list(value):
        # Copy plain lists only; any other value (None, tuple, ...) passes through unchanged.
        return list(value) if isinstance(value, list) else value

    def _build_pool(pool_replica_count):
        # Build a self-contained pool spec that shares no mutable state with other pools.
        machine_spec = {'machine_type': machine_type}
        if accelerator_count > 0:
            machine_spec['accelerator_type'] = accelerator_type
            machine_spec['accelerator_count'] = accelerator_count

        container_spec = {
            'image_uri': image_uri,
            'args': _copy_if_list(args),
            'command': _copy_if_list(cmd),
        }

        return {
            'replica_count': pool_replica_count,
            'machine_spec': machine_spec,
            'container_spec': container_spec,
        }

    # The chief pool always has exactly one replica.
    worker_pool_specs = [_build_pool(1)]
    if replica_count > 1:
        # Remaining replicas go into a second pool.
        worker_pool_specs.append(_build_pool(replica_count - 1))

    return worker_pool_specs

### Prepare worker pool specification

In [13]:
# Training / validation data and the pretrained encoder referenced by the overrides below.
MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# Timestamped job name, so each run gets its own model directory.
job_name = f'JOB_{time.strftime("%Y%m%d_%H%M%S")}'
output_dir = 'gs://jk-vertex-demos/jobs'
model_dir = f'{output_dir}/{job_name}/model'
tfhub_cache_dir = f'{output_dir}/tfhub-cache'

# Trainer invocation settings (paths are relative to the container's working directory).
config_file = 'trainer/glue_mnli_matched.yaml'
mode = 'train_and_eval'
experiment = 'bert/sentence_prediction'

# Hardware per replica and the distribution strategy handed to the trainer.
machine_type = 'n1-standard-8'
accelerator_count = 1
accelerator_type = 'NVIDIA_TESLA_T4'
strategy = 'multi_worker_mirrored'

# Total replicas (chief included) and the batch size used for both data splits.
replica_count = 2
global_batch_size = 32

# Overrides are later joined with ',' into the single --params_override flag.
params_override = [
    f'task.train_data.input_path={MNLI_TRAIN_SPLIT}',
    f'task.validation_data.input_path={MNLI_VALID_SPLIT}',
    f'task.train_data.global_batch_size={global_batch_size}',
    f'task.validation_data.global_batch_size={global_batch_size}',
    f'task.hub_module_url={BERT_HUB_URL}',
    f'runtime.num_gpus={accelerator_count}',
    f'runtime.distribution_strategy={strategy}',
    'runtime.all_reduce_alg=nccl',
]

# Container command and its arguments.
cmd = ['python', 'trainer/train.py']
args = [
    f'--experiment={experiment}',
    f'--mode={mode}',
    f'--model_dir={model_dir}',
    f'--config_file={config_file}',
    f'--tfhub_cache_dir={tfhub_cache_dir}',
    f'--params_override={",".join(params_override)}',
]

worker_pool_specs = prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=args,
    cmd=cmd,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type,
)

# Show the resulting specification for inspection before submitting.
pp = pprint.PrettyPrinter()
print(pp.pformat(worker_pool_specs))

[{'container_spec': {'args': ['--experiment=bert/sentence_prediction',
                              '--mode=train_and_eval',
                              '--model_dir=gs://jk-vertex-demos/jobs/JOB_20210602_003422/model',
                              '--config_file=trainer/glue_mnli_matched.yaml',
                              '--tfhub_cache_dir=gs://jk-vertex-demos/jobs/tfhub-cache',
                              '--params_override=task.train_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record,task.validation_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record,task.train_data.global_batch_size=32,task.validation_data.global_batch_size=32,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4,runtime.num_gpus=1,runtime.distribution_strategy=multi_worker_mirrored,runtime.all_reduce_alg=nccl'],
                     'command': ['python', 'trainer/train.py'],
                     'image_uri': 'gcr.io/jk-mlops-dev/model_

### Submit and monitor the job

In [14]:
# Initialize the Vertex AI SDK with the project, region and staging bucket
# defined in the environment constants cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

In [15]:
# Reuse the timestamped job name as the display name of the job.
display_name = job_name

# Create the custom job from the worker pool specification prepared earlier.
job = aiplatform.CustomJob(
    display_name=display_name,
    worker_pool_specs=worker_pool_specs,
)

# NOTE(review): sync=False presumably submits the job without blocking the
# notebook until it finishes (the captured log ends at JOB_STATE_QUEUED) —
# confirm against the SDK documentation.
job.run(sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/4909490941833773056
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/4909490941833773056')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4909490941833773056?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/4909490941833773056 current state:
JobState.JOB_STATE_QUEUED
