# Custom training job on Vertex AI

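This walkthrough uses [autopackaging](https://cloud.google.com/vertex-ai/docs/training/create-custom-job#autopackaging) to submit the local training code as a custom job. Set up Docker access first: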

```bash
# Register gcloud as a Docker credential helper so Docker can authenticate
# against Google-hosted registries.
gcloud auth configure-docker
# Let the current (non-root) user talk to the Docker daemon.
# NOTE(review): mode 666 makes the socket readable and writable by every local
# user, and access to the Docker socket is effectively root access. Acceptable
# on a throwaway single-user VM; on a shared machine consider adding the user
# to the `docker` group instead.
sudo chmod 666 /var/run/docker.sock
```

In [1]:
!gcloud auth configure-docker

/bin/bash: gcloud: command not found


In [4]:
!gcloud ai custom-jobs create \
  --region=europe-west4 \
  --display-name=custom_job_haba \
  --worker-pool-spec=machine-type=n1-standard-4,replica-count=1,executor-image-uri=europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:latest,local-package-path=../,script=src/trainer/task.py \
  --args=--project=bence-bial-sandbox \
  --args=--bucket=haba-ws \
  --args=--output-path=output-debug \
  --args=--data=gs://haba-ws/data.csv \
  --args=--batch-size=16 \
  --args=--epochs=10 \
  --args=--eval-steps=1000

/bin/bash: gcloud: command not found


The custom args are formatted following [this notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/two-tower-model-introduction.ipynb).

## Package application

```sh
# Build a gzipped-tar source distribution of the trainer package into dist/.
python setup.py sdist --formats=gztar
# Upload the tarball to the bucket; this is the object passed as `package_uri`
# to run_training_job in the "Custom train job" section.
gsutil cp dist/text-classification-0.1.tar.gz gs://haba-ws/container/
```


## Custom train job

In [1]:
from trainer.job import run_training_job

run_training_job(
    data="gs://haba-ws/data.csv",
    epochs=10,
    project="bence-bial-sandbox",
    bucket="haba-ws",
    output_path="output-debug",
    batch_size=16,
    eval_steps=500,
    early_stopping_patience=5,
    n_gpu=4,
    region="europe-west4",
    api_endpoint="europe-west4-aiplatform.googleapis.com",
    package_uri="gs://haba-ws/container/text-classification-0.1.tar.gz"
)

response: name: "projects/637644797917/locations/europe-west4/customJobs/149324674168258560"
display_name: "train_text_classifier_2022_01_03_09_43_55"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "n1-standard-4"
      accelerator_type: NVIDIA_TESLA_V100
      accelerator_count: 4
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    python_package_spec {
      executor_image_uri: "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:m82"
      package_uris: "gs://haba-ws/container/text-classification-0.1.tar.gz"
      python_module: "trainer.task"
      args: "--epochs=10"
      args: "--bucket=haba-ws"
      args: "--project=bence-bial-sandbox"
      args: "--data=gs://haba-ws/data.csv"
      args: "--output-path=output-debug"
      args: "--batch-size=16"
      args: "--epochs=10"
      args: "--eval-steps=500"
      args: "--early-stopping-patience=5"
    }
  }
}
state: JOB_STATE_PENDING
create

name: "projects/637644797917/locations/europe-west4/customJobs/149324674168258560"
display_name: "train_text_classifier_2022_01_03_09_43_55"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "n1-standard-4"
      accelerator_type: NVIDIA_TESLA_V100
      accelerator_count: 4
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    python_package_spec {
      executor_image_uri: "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-9:m82"
      package_uris: "gs://haba-ws/container/text-classification-0.1.tar.gz"
      python_module: "trainer.task"
      args: "--epochs=10"
      args: "--bucket=haba-ws"
      args: "--project=bence-bial-sandbox"
      args: "--data=gs://haba-ws/data.csv"
      args: "--output-path=output-debug"
      args: "--batch-size=16"
      args: "--epochs=10"
      args: "--eval-steps=500"
      args: "--early-stopping-patience=5"
    }
  }
}
state: JOB_STATE_PENDING
create_time {
  

In [None]:
python trainer/task.py --data=gs://haba-ws/data.csv --epochs=1 --project=bence-bial-sandbox --bucket=haba-ws --output-path=output-debug --batch-size=1 --eval-steps=50