In [None]:
#####################################################################
#
#       aiplatform.HyperparameterTuningJob on top of aiplatform.CustomJob.from_local_script
#
#####################################################################

In [None]:
#####################################################################
#
# define the training script
#
#####################################################################

In [None]:
%%writefile task.py

import argparse
import pickle
import os, json

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from google.cloud import storage
import hypertune


# ---- command-line arguments ----------------------------------------
# --max_depth is the hyperparameter being tuned; --bucket_name is accepted
# but not read anywhere in this script.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--bucket_name',
    dest='bucket_name',
    default="",
    type=str,
    help='The GCS bucket to store model artifacts -> w/o gs://',
)
parser.add_argument(
    '--max_depth',
    dest='max_depth',
    default=10,
    type=int,
    help='The maximum depth of the tree',
)
args = parser.parse_args()

# ---- data ----------------------------------------------------------
# iris dataset, 70/30 train/test split with a fixed seed
dataset = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.3,
    random_state=7,
)

# ---- training ------------------------------------------------------
# the tuned value of max_depth is passed straight to the classifier
model = XGBClassifier(max_depth=args.max_depth)
model.fit(X_train, y_train)

# ---- evaluation ----------------------------------------------------
# predict on the held-out split and round each prediction
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# ---- report the metric to the tuning service -----------------------
# the tag 'accuracy' is the name the metric is reported under
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='accuracy',
    metric_value=accuracy,
)

# ---- write artifacts to local disk ---------------------------------
model_filename = "model.bst"
model.save_model(model_filename)

metrics_filename = "metrics.json"
with open(metrics_filename, "w") as metrics_file:
    metrics_file.write(json.dumps({"accuracy": accuracy}))

# ---- upload artifacts to GCS ---------------------------------------
# AIP_MODEL_DIR is read from the environment and used as the destination
# prefix; the model is uploaded first, then the accuracy file.
storage_client = storage.Client()
model_directory = os.environ["AIP_MODEL_DIR"]

for artifact_filename in (model_filename, metrics_filename):
    storage_path = os.path.join(model_directory, artifact_filename)
    blob = storage.blob.Blob.from_string(storage_path, client=storage_client)
    blob.upload_from_filename(artifact_filename)

In [None]:
#####################################################################
#
# kick off the custom training job
#
#####################################################################

In [None]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from datetime import datetime

In [None]:
# specify parameters
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
REGION = "us-central1"
BUCKET_NAME = f"bkt-{PROJECT_ID}-vpipelines"
BUCKET_PATH = f"gs://{BUCKET_NAME}"
PIPELINE_ROOT = f"{BUCKET_PATH}/pipeline_root"
PIPELINE_DATA = f"{BUCKET_PATH}/data"
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
# Session defaults for the SDK: project, region and staging bucket.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_PATH,
)

# Build the custom job from the local training script task.py.
# The bucket name is forwarded to the script as a command-line argument;
# --max_depth is not set here.
job = aiplatform.CustomJob.from_local_script(
    display_name=f"vai_HyperparameterTuningJob_CustomJob_fromLocalScript_{TIMESTAMP}",
    project=PROJECT_ID,
    location=REGION,
    script_path="task.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/xgboost-cpu.1-1:latest",
    requirements=["gcsfs", "pip==22.3.1", "cloudml-hypertune"],
    replica_count=1,
    machine_type="n1-standard-4",
    accelerator_count=0,
    args=[f"--bucket_name={BUCKET_NAME}"],
    environment_variables={'MY_KEY': 'MY_VALUE'},
    labels={'my_key': 'my_value'},
)

In [None]:
# Display name for the hyperparameter tuning job; TIMESTAMP makes it unique per run.
JOB_DISPLAY_NAME = f"vai_HyperparameterTuningJob_{TIMESTAMP}"

In [None]:
# Hyperparameter tuning job wrapped around the custom job defined earlier.
#   metric_spec    : the 'accuracy' metric is to be maximized
#   parameter_spec : max_depth is searched as an integer in [1, 5], linear scale
hpt_job = aiplatform.HyperparameterTuningJob(
    display_name=JOB_DISPLAY_NAME,
    custom_job=job,
    metric_spec={"accuracy": "maximize"},
    parameter_spec={
        "max_depth": hpt.IntegerParameterSpec(min=1, max=5, scale="linear"),
    },
    # The search algorithm to use: grid, random and None.
    # If None is specified, the Vizier service (Bayesian) is used.
    search_algorithm=None,
    max_trial_count=2,
    parallel_trial_count=1,
)

In [None]:
# Launch the tuning job under the project's pipelines service account.
hpt_job.run(
    service_account=f"sa-vertex-pipelines@{PROJECT_ID}.iam.gserviceaccount.com",
)

In [None]:
# find the best trial
print( f"HPT JOB NAME: {hpt_job.display_name}")
print("")

# best = (trial id, 1st parameter value, 2nd parameter value or None, metric value)
# The tuple keeps this placeholder value when no trial has a reported metric.
best = (None, None, None, 0.0)
for trial in hpt_job.trials:
    metrics = trial.final_measurement.metrics
    if len(metrics) == 0:
        # No final measurement was recorded for this trial, so it cannot be
        # ranked; skip it instead of failing on metrics[0].
        continue

    metric_value = float(metrics[0].value)

    # The first measured trial is always accepted, so a metric of 0.0 (or
    # below) can still win; afterwards only a strictly higher value replaces it.
    if best[0] is not None and metric_value <= best[3]:
        continue

    # A second tuned parameter is optional: its slot stays None when the trial
    # has only one parameter or when the second value is not numeric.
    second_value = None
    if len(trial.parameters) > 1:
        try:
            second_value = float(trial.parameters[1].value)
        except (TypeError, ValueError):
            second_value = None

    best = (
        trial.id,
        float(trial.parameters[0].value),
        second_value,
        metric_value,
    )

print(best)

In [None]:
# Find the best model
# Read the output URI prefix of the trial job spec from the job's dict representation.
model_location = hpt_job.to_dict()['trialJobSpec']['baseOutputDirectory']['outputUriPrefix']
# Build the path <output prefix>/<best trial id>/model; best[0] is the trial id slot of `best`.
# NOTE(review): if no trial was selected, best[0] is still None and this becomes ".../None/model".
BEST_MODEL_DIR = f"{model_location}/{best[0]}/model"
# List whatever is stored under that path.
! gsutil ls {BEST_MODEL_DIR}

In [None]:
# see info about the job
# Bare last expression: the notebook displays the job's dict representation as the cell output.
hpt_job.to_dict()