# VERTEX AI POC

1. Create a new project in GCP.
2. Enable the Vertex AI, Compute Engine, and Cloud Storage APIs.

In [21]:
PROJECT_ID = "homedepot-345802"
REGION = "us-central1"
# set current project as project_id 
! gcloud config set project $PROJECT_ID

Updated property [core/project].


In [6]:
from datetime import datetime

# Timestamp (YYYYMMDDHHMMSS) used to suffix resource names and output paths.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

## Creating bucket to host data and model

In [22]:
# bucket we are going to use to host data and model
BUCKET_NAME = "gs://homedpot"
! gsutil mb -l $REGION $BUCKET_NAME

Creating gs://homedpot/...


In [9]:
#validate if you have access
! gsutil ls -al $BUCKET_NAME

      1493  2022-03-31T06:06:45Z  gs://homedpot/aiplatform-2022-03-31-02:06:45.075-aiplatform_custom_trainer_script-0.1.tar.gz#1648706805860842  metageneration=1
      1501  2022-03-31T06:07:20Z  gs://homedpot/aiplatform-2022-03-31-02:07:20.208-aiplatform_custom_trainer_script-0.1.tar.gz#1648706840954735  metageneration=1
      1888  2022-03-31T05:50:33Z  gs://homedpot/trainer_iris.tar.gz#1648705833240231  metageneration=1
                                 gs://homedpot/20220330233852/
TOTAL: 3 objects, 4882 bytes (4.77 KiB)


## Initializing the AI Platform SDK

In [10]:
import google.cloud.aiplatform as aip

# Initialise the SDK once for the rest of the notebook. `location` is passed
# explicitly so SDK resources are created in the same REGION as the bucket
# instead of depending on the SDK's built-in default location.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

INFO:numexpr.utils:Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


## Define the training and serving containers and their machine types

In [23]:
# Container images: one runs the training script, the other serves predictions.
TRAIN_IMAGE = "gcr.io/cloud-aiplatform/training/xgboost-cpu.1-1:latest"
DEPLOY_IMAGE = "gcr.io/cloud-aiplatform/prediction/xgboost-cpu.1-1:latest"

# Training and serving use the same machine type.
TRAIN_COMPUTE = DEPLOY_COMPUTE = "n1-standard-4"


## Setting up training package 

In [25]:
# Make folder for Python training script
! rm -rf custom
! mkdir custom

# Add package information
! touch custom/README.md

setup_cfg = "[egg_info]\n\ntag_build =\n\ntag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'tensorflow_datasets==1.3.0',\n\n    ],\n\n    packages=setuptools.find_packages())"
! echo "$setup_py" > custom/setup.py

pkg_info = "Metadata-Version: 1.0\n\nName: Iris tabular classification\n\nVersion: 0.0.0\n\nSummary: Demostration training script\n\nHome-page: www.google.com\n\nAuthor: Google\n\nAuthor-email: aferlitsch@google.com\n\nLicense: Public\n\nDescription: Demo\n\nPlatform: Vertex"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder
! mkdir custom/trainer
! touch custom/trainer/__init__.py

In [26]:
%%writefile custom/trainer/task.py
# Single Instance Training for Iris

import datetime
import os
import subprocess
import sys
import pandas as pd
import xgboost as xgb

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', dest='model_dir',
                    default=os.getenv('AIP_MODEL_DIR'), type=str, help='Model dir.')
args = parser.parse_args()

# Download data
iris_data_filename = 'iris_data.csv'
iris_target_filename = 'iris_target.csv'
data_dir = 'gs://cloud-samples-data/ai-platform/iris'

# gsutil outputs everything to stderr so we need to divert it to stdout.
subprocess.check_call(['gsutil', 'cp', os.path.join(data_dir,
                                                    iris_data_filename),
                       iris_data_filename], stderr=sys.stdout)
subprocess.check_call(['gsutil', 'cp', os.path.join(data_dir,
                                                    iris_target_filename),
                       iris_target_filename], stderr=sys.stdout)


# Load data into pandas, then use `.values` to get NumPy arrays
iris_data = pd.read_csv(iris_data_filename).values
iris_target = pd.read_csv(iris_target_filename).values

# Convert one-column 2D array into 1D array for use with XGBoost
iris_target = iris_target.reshape((iris_target.size,))


# Load data into DMatrix object
dtrain = xgb.DMatrix(iris_data, label=iris_target)


# Train XGBoost model
bst = xgb.train({}, dtrain, 20)

# Export the classifier to a file
model_filename = 'model.bst'
bst.save_model(model_filename)

# Upload the saved model file to Cloud Storage
gcs_model_path = os.path.join(args.model_dir, model_filename)
subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path],
    stderr=sys.stdout)

Writing custom/trainer/task.py


## Upload the training package to our GCP bucket

In [27]:
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz $BUCKET_NAME/trainer_iris.tar.gz

a custom
a custom/PKG-INFO
a custom/.DS_Store
a custom/README.md
a custom/setup.py
a custom/setup.cfg
a custom/trainer
a custom/trainer/task.py
a custom/trainer/__init__.py
Copying file://custom.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  1.8 KiB/  1.8 KiB]                                                
Operation completed over 1 objects/1.8 KiB.                                      


## Create custom training job and run it

In [13]:
import os
# Point the Google client libraries at the service-account key file.
# NOTE(review): this is a path relative to the notebook's working directory --
# confirm the key file is kept out of version control.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'service_key.json'

# Describe the custom training job: the local training script is run inside
# the TRAIN_IMAGE container with the extra pip requirements listed below.
job = aip.CustomTrainingJob(
    display_name=f"iris_{TIMESTAMP}",
    script_path="custom/trainer/task.py",
    container_uri=TRAIN_IMAGE,
    # NOTE(review): custom/trainer/task.py imports neither gcsfs nor
    # tensorflow-datasets -- confirm whether these are still required.
    requirements=["gcsfs==0.7.1", "tensorflow-datasets==4.4"],
)

print(job)

<google.cloud.aiplatform.training_jobs.CustomTrainingJob object at 0x7fcca3d50f50>


In [14]:
# Root folder that receives everything this training run writes.
BASE_OUTPUT_DIR = "{}/{}".format(BUCKET_NAME, TIMESTAMP)

# Folder used later as the model's artifact location. It is derived *before*
# the job starts so MODEL_DIR always names the artifact folder, even when this
# cell is interrupted or job.run() raises part-way through.
MODEL_DIR = BASE_OUTPUT_DIR + "/model"
model_path_to_deploy = MODEL_DIR

# Run the training job on a single replica and block until it finishes.
job.run(
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    base_output_dir=BASE_OUTPUT_DIR,
    sync=True,
)

INFO:google.cloud.aiplatform.utils.source_utils:Training script copied to:
gs://homedpot/aiplatform-2022-03-31-07:12:03.893-aiplatform_custom_trainer_script-0.1.tar.gz.
INFO:google.cloud.aiplatform.training_jobs:Training Output directory:
gs://homedpot/20220331071112 
INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3420224981999550464?project=559280923802
INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/559280923802/locations/us-central1/trainingPipelines/3420224981999550464 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/5724537471027380224?project=559280923802
INFO:google.cloud.aiplatform.training_jobs:CustomTrainingJob projects/559280923802/locations/us-central1/trainingPipelines/3420224981999550464 current state:
PipelineState.PIPE

## Upload the model as a Vertex AI Model resource

In [15]:
# Register the artifacts stored under MODEL_DIR as a model that is served
# with the DEPLOY_IMAGE container.
model = aip.Model.upload(
    display_name=f"iris_{TIMESTAMP}",
    artifact_uri=MODEL_DIR,
    serving_container_image_uri=DEPLOY_IMAGE,
    sync=False,
)

# The upload was started with sync=False, so wait for it here.
model.wait()

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/559280923802/locations/us-central1/models/8252446094923923456/operations/2106227291666251776
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/559280923802/locations/us-central1/models/8252446094923923456
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/559280923802/locations/us-central1/models/8252446094923923456')


## Deploy the model for online predictions. You can specify min/max nodes for scaling

In [16]:
# Serving machine type and the display name of the deployed model.
DEPLOY_COMPUTE = "n1-standard-4"
DEPLOYED_NAME = f"iris-{TIMESTAMP}"

# Send 100% of the traffic to key "0".
# NOTE(review): "0" presumably stands for the model deployed by this call --
# confirm against the SDK documentation.
TRAFFIC_SPLIT = {"0": 100}

# Minimum and maximum replica counts are both 1.
MIN_NODES = MAX_NODES = 1

# Deploy the uploaded model and keep the returned endpoint for predictions.
endpoint = model.deploy(
    deployed_model_display_name=DEPLOYED_NAME,
    traffic_split=TRAFFIC_SPLIT,
    machine_type=DEPLOY_COMPUTE,
    min_replica_count=MIN_NODES,
    max_replica_count=MAX_NODES,
)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/559280923802/locations/us-central1/endpoints/2958979304391704576/operations/6717913310093639680
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/559280923802/locations/us-central1/endpoints/2958979304391704576
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/559280923802/locations/us-central1/endpoints/2958979304391704576')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/559280923802/locations/us-central1/endpoints/2958979304391704576
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/559280923802/locations/us-central1/endpoints/2958979304391704576/operations/3182587602607800320
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/55928092380

## Get some predictions

In [17]:
# A single instance made of four numeric feature values.
instances_list = [[1.4, 1.3, 5.1, 2.8]]

# Request an online prediction from the endpoint and print the raw response.
prediction = endpoint.predict(instances=instances_list)
print(prediction)

Prediction(predictions=[2.045193195343018], deployed_model_id='440182883110354944', explanations=None)


## Project Cleanup & Deleting all the resources on GCP

In [19]:
endpoint.undeploy_all()

delete_all = True

if delete_all:
    # Delete the dataset using the Vertex dataset object
    try:
        if "dataset" in globals():
            dataset.delete()
    except Exception as e:
        print(e)

    # Delete the model using the Vertex model object
    try:
        if "model" in globals():
            model.delete()
    except Exception as e:
        print(e)

    # Delete the endpoint using the Vertex endpoint object
    try:
        if "endpoint" in globals():
            endpoint.delete()
    except Exception as e:
        print(e)

    # Delete the AutoML or Pipeline trainig job
    try:
        if "dag" in globals():
            dag.delete()
    except Exception as e:
        print(e)

    # Delete the custom trainig job
    try:
        if "job" in globals():
            job.delete()
    except Exception as e:
        print(e)

    # Delete the batch prediction job using the Vertex batch prediction object
    try:
        if "batch_predict_job" in globals():
            batch_predict_job.delete()
    except Exception as e:
        print(e)

    # Delete the hyperparameter tuning job using the Vertex hyperparameter tuning object
    try:
        if "hpt_job" in globals():
            hpt_job.delete()
    except Exception as e:
        print(e)

    if "BUCKET_NAME" in globals():
        ! gsutil rm -r $BUCKET_NAME

INFO:google.cloud.aiplatform.base:Deleting Model : projects/559280923802/locations/us-central1/models/8252446094923923456
INFO:google.cloud.aiplatform.base:Delete Model  backing LRO: projects/559280923802/locations/us-central1/operations/1309090157621673984
INFO:google.cloud.aiplatform.base:Model deleted. . Resource name: projects/559280923802/locations/us-central1/models/8252446094923923456
INFO:google.cloud.aiplatform.base:Deleting Endpoint : projects/559280923802/locations/us-central1/endpoints/2958979304391704576
INFO:google.cloud.aiplatform.base:Delete Endpoint  backing LRO: projects/559280923802/locations/us-central1/operations/5920776176049061888
INFO:google.cloud.aiplatform.base:Endpoint deleted. . Resource name: projects/559280923802/locations/us-central1/endpoints/2958979304391704576
INFO:google.cloud.aiplatform.base:Deleting CustomTrainingJob : projects/559280923802/locations/us-central1/trainingPipelines/3420224981999550464
INFO:google.cloud.aiplatform.base:Delete CustomTra