In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AI Platform (Unified) SDK: Hyperparameter tuning for custom tabular model

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/notebooks/templates/ai_platform_notebooks_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/notebooks/templates/ai_platform_notebooks_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

# Overview


This tutorial demonstrates how to use the AI Platform (Unified) Python SDK to do hyperparameter tuning for a custom tabular regression model.

### Dataset

The dataset used for this tutorial is the [Boston Housing Prices dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html). The version of the dataset you will use in this tutorial is built into Tensorflow. The trained model predicts the median price of a house in units of 1K USD.

### Objective

In this notebook, you will learn how to create a hyperparameter tuning job for a custom tabular regression model from a Python script in a docker container using the AI Platform (Unified) SDK, and then do a prediction on the deployed model. You can alternatively create hyperparameter tuning jobs for custom models from the command line using `gcloud` or online using Google Cloud Console.

The steps performed include: 

- Create an AI Platform (Unified) hyperparameter tuning job for training a custom model.
- Tune the custom model.
- Evaluate the study results.


### Costs 

This tutorial uses billable components of Google Cloud Platform (GCP):

* Cloud AI Platform
* Cloud Storage

Learn about [Cloud AI Platform
pricing](https://cloud.google.com/ml-engine/docs/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the latest (preview) version of AI Platform (Unified) SDK.

In [None]:
# Install or upgrade (-U) the AI Platform SDK into the per-user site-packages (--user).
! pip3 install -U google-cloud-aiplatform --user

Install the Google `cloud-storage` library as well.

In [None]:
# Install the Cloud Storage client library.
! pip3 install google-cloud-storage

### Restart the Kernel

Once you've installed the AI Platform (Unified) SDK, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart only when the AUTORUN environment variable is unset or empty
# (presumably it marks an unattended run -- confirm).
if not os.getenv("AUTORUN"):
    import IPython

    # Shut the kernel down with restart=True so the freshly installed
    # packages can be found after the restart.
    IPython.Application.instance().kernel.do_shutdown(True)

## Before you begin

### GPU run-time

**Make sure you're running this notebook in a GPU runtime if you have that option. In Colab, select Runtime --> Change runtime type**

### Set up your GCP project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a GCP project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the AI Platform APIs, Compute Engine APIs and Container Registry API.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component,containerregistry.googleapis.com)

4. [Google Cloud SDK](https://cloud.google.com/sdk) is already installed in AI Platform Notebooks.

5. Enter your project ID in the cell below. Then run the  cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Project ID

**If you don't know your project ID**, you might be able to get your project ID using `gcloud` command by executing the second cell below.

In [None]:
# Google Cloud project used throughout this notebook; leave the placeholder
# to have the next cell try to read the project from gcloud.
PROJECT_ID = "[your-project-id]" #@param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    # The command may produce no output (or only a blank line) when gcloud is
    # missing or no default project is configured, so guard the lookup instead
    # of indexing blindly.
    detected_project = shell_output[0].strip() if shell_output else ""
    if detected_project:
        PROJECT_ID = detected_project
        print("Project ID:", PROJECT_ID)
    else:
        print("Could not detect a project ID from gcloud; set PROJECT_ID manually in the cell above.")

In [None]:
# Make the gcloud CLI use the selected project for the commands in this notebook.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for AI Platform (Unified). We recommend that, when possible, you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You cannot use a Multi-Regional Storage bucket for training with AI Platform. Not all regions provide support for all AI Platform services. For the latest support per region, see [Region support for AI Platform (Unified) services](https://cloud.google.com/ai-platform-unified/docs/general/locations).

In [None]:
# Region used for the API endpoint, the resource parent path and the bucket location.
REGION = 'us-central1' #@param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append onto the name of resources which will be created in this tutorial.

In [None]:
from datetime import datetime

# Second-resolution stamp (YYYYmmddHHMMSS) appended to resource names so that
# names created in this session do not collide with other users' resources.
TIMESTAMP = "{:%Y%m%d%H%M%S}".format(datetime.now())

### Authenticate your GCP account

**If you are using AI Platform Notebooks**, your environment is already
authenticated. Skip this step.

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on AI Platform, then don't execute this code
# (the presence of this metadata file is used as the AI Platform marker).
if not os.path.exists('/opt/deeplearning/metadata/env_version'):
    # Colab: the google.colab module is already loaded, so use its auth flow.
    if 'google.colab' in sys.modules:
        from google.colab import auth as google_auth
        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you submit a hyperparameter training job for a custom model using the Cloud SDK, you upload a Python package
containing your training code to a Cloud Storage bucket. AI Platform runs
the code from this package. In this tutorial, AI Platform also saves the
trained model that results from your job in the same bucket. You can then
create an AI Platform model version based on this output in order to serve
online predictions.

Set the name of your Cloud Storage bucket below. It must be unique across all Cloud Storage buckets. 

In [None]:
# Cloud Storage bucket for the training package and model artifacts; leave the
# placeholder to have a timestamped name generated in the next cell.
BUCKET_NAME = "[your-bucket-name]" #@param {type:"string"}

In [None]:
# Fall back to a generated, timestamped bucket name when none was supplied.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = "".join([PROJECT_ID, "ucaip-custom-", TIMESTAMP])

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket, with its location set to the selected region (-l).
! gsutil mb -l $REGION gs://$BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents in long format (-l), including all object versions (-a).
! gsutil ls -al gs://$BUCKET_NAME

### Import libraries and define constants

#### Import AI Platform (Unified) SDK

Import the AI Platform (Unified) SDK into your Python environment.

In [1]:
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip

In [3]:
# List the members of the study search-algorithm enum exposed by the installed SDK.
dir(aip.StudySpec.Algorithm)

['ALGORITHM_UNSPECIFIED',
 'GRID_SEARCH',
 'RANDOM_SEARCH',
 '__class__',
 '__doc__',
 '__members__',
 '__module__']

#### AI Platform (Unified) constants

Let's now setup some constants for AI Platform (Unified):

- `API_ENDPOINT`: The AI Platform (Unified) API service endpoint for dataset, model, job, pipeline and endpoint services.
- `PARENT`: The AI Platform (Unified) location root path for dataset, model and endpoint resources.

In [None]:
# Regional service endpoint that the clients in this notebook connect to
API_ENDPOINT = "{region}-aiplatform.googleapis.com".format(region=REGION)

# Location root path for your dataset, model and endpoint resources
PARENT = "/".join(["projects", PROJECT_ID, "locations", REGION])

#### Hardware Accelerators

Let's now set the hardware accelerators (e.g., GPU), if any, for training and prediction.

Set the variables `TRAIN_GPU/TRAIN_NGPU` and `DEPLOY_GPU/DEPLOY_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the compute instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each compute instance, you would specify:

    (aip.AcceleratorType.NVIDIA_TESLA_K80, 4)

For GPU, available accelerators include:
   - aip.AcceleratorType.NVIDIA_TESLA_K80
   - aip.AcceleratorType.NVIDIA_TESLA_P100
   - aip.AcceleratorType.NVIDIA_TESLA_P4
   - aip.AcceleratorType.NVIDIA_TESLA_T4
   - aip.AcceleratorType.NVIDIA_TESLA_V100

   
Otherwise specify `(None, None)` for the container image for a CPU.
   
*Note*: TF releases before 2.3 with GPU support will fail to load the custom model in this tutorial. This is a known issue, caused by static graph ops that are generated in the serving function, and it is fixed in TF 2.3. If you encounter this issue on your own custom models, the workaround is to create your own docker container image for TF 2.3 with GPU support.

In [None]:
# (accelerator type, accelerator count) for training and for deployment.
# (None, None) makes the cells below pick the CPU container images.
TRAIN_GPU, TRAIN_NGPU = (None, None)
DEPLOY_GPU, DEPLOY_NGPU = (None, None)

#### Container (Docker) image

Next, we will set the docker container images for training and prediction

- Set the variable `TF` to the Tensorflow version of the container image. For example, `2-1` would be version 2.1, and `1-15` would be version 1.15. Google Cloud continuously adds prebuilt training and prediction container images, below are some of the prebuilt images available:

 - Tensorflow 1.15
   - `gcr.io/cloud-aiplatform/training/tf-cpu.1-15:latest`
   - `gcr.io/cloud-aiplatform/training/tf-gpu.1-15:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf-gpu.1-15:latest`
 - Tensorflow 2.1
   - `gcr.io/cloud-aiplatform/training/tf-cpu.2-1:latest`
   - `gcr.io/cloud-aiplatform/training/tf-gpu.2-1:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-1:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-1:latest`
 - Tensorflow 2.2
   - `gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest`
   - `gcr.io/cloud-aiplatform/training/tf-gpu.2-2:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-2:latest`
 - XGBoost
   - `gcr.io/cloud-aiplatform/training/xgboost-cpu.1-1`
   - `gcr.io/cloud-aiplatform/prediction/xgboost-cpu.1-1`
 - Scikit-learn
   - `gcr.io/cloud-aiplatform/training/scikit-learn-cpu.0-23`
   - `gcr.io/cloud-aiplatform/prediction/scikit-learn-cpu.0-23`
 - Pytorch
   - `gcr.io/cloud-aiplatform/training/pytorch-cpu.1-4:latest`
   - `gcr.io/cloud-aiplatform/training/pytorch-gpu.1-4:latest`
   
Google Cloud AI continuously adds new training and prediction container images. For the latest list, see [Pre-built containers for training](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers) and [Pre-built containers for prediction](https://cloud.google.com/ai-platform-unified/docs/predictions/pre-built-containers).

In [None]:
TF = '2-1'

# Training images are named tf-{cpu,gpu}.<version> for every TensorFlow release.
TRAIN_VERSION = 'tf-{}.{}'.format('gpu' if TRAIN_GPU else 'cpu', TF)

# Prediction images use the tf2- prefix for TensorFlow 2.x and tf- otherwise.
DEPLOY_VERSION = '{}-{}.{}'.format(
    'tf2' if TF[0] == '2' else 'tf',
    'gpu' if DEPLOY_GPU else 'cpu',
    TF,
)

TRAIN_IMAGE = "gcr.io/cloud-aiplatform/training/{}:latest".format(TRAIN_VERSION)
DEPLOY_IMAGE = "gcr.io/cloud-aiplatform/prediction/{}:latest".format(DEPLOY_VERSION)

print("Training:", TRAIN_IMAGE, TRAIN_GPU, TRAIN_NGPU)
print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

#### Machine Type

- Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to the compute instance you will use for training and prediction.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]
  
*Note: The following is not supported for training*
 
 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs
 
*Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs.*

In [None]:
# Training instance: machine family plus vCPU count, joined as <family>-<vcpus>
MACHINE_TYPE = 'n1-standard'
VCPU = '4'
TRAIN_COMPUTE = '-'.join((MACHINE_TYPE, VCPU))
print('Train Compute Instance', TRAIN_COMPUTE)

# Deployment instance: same construction, configured independently of training
MACHINE_TYPE = 'n1-standard'
VCPU = '4'
DEPLOY_COMPUTE = '-'.join((MACHINE_TYPE, VCPU))
print('Deploy Compute Instance', DEPLOY_COMPUTE)

# Tutorial

Now you are ready to start creating your own hyperparameter tuning and training of a custom tabular regression model for Boston Housing.

## Clients

The AI Platform (Unified) SDK works as a client/server model. On your side, the Python script, you will create a client that sends requests and receives responses from the server -- AI Platform.

You will use several clients in this tutorial, so you will set them all up upfront.

- Job Service for tuning jobs.
- Model Service for managed models.

In [None]:
# client options same for all services
client_options = {"api_endpoint": API_ENDPOINT}

def create_job_client():
    client = aip.JobServiceClient(
        client_options=client_options
    )
    return client


def create_model_client():
    client = aip.ModelServiceClient(
        client_options=client_options
    )
    return client


clients = {}
clients['job'] = create_job_client()
clients['model'] = create_model_client()

for client in clients.items():
    print(client)

## Prepare your custom job specification

Now that your clients are ready, your first step is to create a Job Specification for your hyperparameter tuning job.

You are going to start with tuning what we can call an **empty job**. That is, you will create a job specification that provisions resources for hyperparameter tuning of a training job, and initiate the job using the client job service -- but the hyperparameter tuning job itself will be empty (i.e., no model is being tuned).

We do this so you can first focus on understanding the basic steps. Afterwards, you will repeat the steps with a focus on adding the Python training package for hyperparameter tuning of a custom model.

### Define a container specification

Let's first start by defining a job name and then a container specification:

- `JOB_NAME`: A unique name for your hyperparameter tuning job. For convenience, we appended the name with the current datetime to make the name unique.
- `MODEL_DIR`: A location in your Cloud Storage bucket for storing the model artifacts.
- `image_uri`: The location (URI) of the container image. This can be either a Google Cloud prebuilt image or your own custom container.
- `--model-dir`: A command line parameter to the container indicating the location to store the model.

In [None]:
# Job name made unique per session by the timestamp suffix
JOB_NAME = "hyperparam_job_" + TIMESTAMP
# Model artifacts are stored in the bucket under a folder named after the job
MODEL_DIR = 'gs://{bucket}/{job}'.format(bucket=BUCKET_NAME, job=JOB_NAME)
# Container to run, plus the command line argument telling it where to store the model
CONTAINER_SPEC = dict(
    image_uri=TRAIN_IMAGE,
    args=["--model-dir=" + MODEL_DIR],
)

### Define the worker pool specification


Next, you define the worker pool specification for your hyperparameter tuning job. This tells AI Platform what type and how many instances of machines to provision for the tuning.

For this tutorial, you will use a single instance (node). 

- `replica_count`: The number of instances to provision of this machine type.
- `machine_type`: The type of GCP instance to provision -- e.g., n1-standard-8.
- `accelerator_type`: The type, if any, of hardware accelerator. In this tutorial, if `TRAIN_GPU != None` you will be using a GPU; otherwise a CPU is allocated.
- `accelerator_count`: The number of accelerators.
- `container_spec`: The docker container to install on the instance(s).

In [None]:
# Start from the machine type, then add the accelerator settings that apply.
machine_spec = {"machine_type": TRAIN_COMPUTE}
if TRAIN_GPU:
    machine_spec["accelerator_type"] = TRAIN_GPU
    machine_spec["accelerator_count"] = TRAIN_NGPU
else:
    # No accelerator requested: set the count to zero explicitly
    machine_spec["accelerator_count"] = 0

# A single worker pool with one replica running the training container
WORKER_POOL_SPEC = [
    dict(
        replica_count=1,
        machine_spec=machine_spec,
        container_spec=CONTAINER_SPEC,
    )
]

If you were doing distributed tuning, you would add a second machine description and set the replica count accordingly. In the example below, the first machine description is the primary (coordinator), and the second ones are the machines the tuning is distributed to.

```
WORKER_POOL_SPEC=[
     {
        "replica_count": 1,
        "machine_spec": {
          "machine_type": "n1-standard-8"
        },
        "container_spec":  CONTAINER_SPEC,
      },
      {
        "replica_count": 6,
        "machine_spec": {
          "machine_type": "n1-standard-8"
        },
        "container_spec": CONTAINER_SPEC
      }
]
```

### Assemble the job specification

Let's now assemble the description for the hyperparameter tuning job specification.

- `display_name`: The human readable name you assign to this custom job.
- `trial_job_spec`: The specification for the custom job. Since this is an empty job, you only specified the resource requirements.
- `study_spec`: The specification for what to tune. Even though this is an empty tuning job, the API call requires at least one parameter to tune and one metric to evaluate on. We put some placeholders here. We will go into detail when we tune an actual custom model.
- `max_trial_count`: The maximum number of tuning trials. We will go into detail when we tune an actual custom model.
- `parallel_trial_count`: How many trials to try in parallel; otherwise, they are done sequentially.

Let's now go into some more detail on the `study_spec`. There are two sections that are required -- even though this is an empty tuning job.

- `parameters`: This is the specification of the hyperparameters that you will tune for the custom training job. It will contain a list of the hyperparameters you've chosen to tune. For each hyperparameter, you specify:
 - `parameter_id`: The Python variable name for the hyperparameter that you use in your custom Python package.
 - `scale_type`: The scale type determines the resolution the hyperparameter tuning service uses when searching over the search space.
   - `UNIT_LINEAR_SCALE`: Uses a resolution that is the same everywhere in the search space.
   - `UNIT_LOG_SCALE`: Values close to the bottom of the search space are further away.
   - `UNIT_REVERSE_LOG_SCALE`: Values close to the top of the search space are further away.
 - **search space**: This is where you will specify the search space of values for the hyperparameter to select for tuning.
   - `integer_value_spec`: Specifies an integer range of values between a `min_value` and `max_value`.
   - `double_value_spec`: Specifies a continuous range of values between a `min_value` and `max_value`.
   - `discrete_value_spec`: Specifies a list of values.
- `metrics`: This is the specification on how to evaluate the result of each tuning trial.
 - `metric_id`: The name of the objective metric that your Python package will report back to the hyperparameter tuning service.
 - `goal`: Tells the service whether to minimize or maximize the objective metric.

In [None]:
# Specification of the (empty) tuning job. The study section only carries
# placeholder values: one parameter to tune and one metric to evaluate on.
HPT_JOB = dict(
    display_name=JOB_NAME,
    # Resources provisioned for each trial
    trial_job_spec=dict(worker_pool_specs=WORKER_POOL_SPEC),
    study_spec=dict(
        parameters=[
            dict(
                parameter_id="learning_rate",
                double_value_spec=dict(min_value=1e-07, max_value=1),
                scale_type=aip.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
            )
        ],
        metrics=[
            dict(
                metric_id="val_loss",
                goal=aip.StudySpec.MetricSpec.GoalType.MINIMIZE,
            )
        ],
    ),
    # A single trial, run on its own
    max_trial_count=1,
    parallel_trial_count=1,
)

## Hyperparameter Tuning the model

Let's now start the hyperparameter tuning of your custom training job on AI Platform. Use this helper function `create_hyperparameter_tuning_job`, which takes the parameter:

- `hpt_job`: The specification for the hyperparameter tuning job.

The helper function uses the job client service and calls the method `create_hyperparameter_tuning_job`, with the parameters:

- `parent`: The AI Platform (Unified) location path to dataset, model and endpoint resources.
- `hyperparameter_tuning_job`: The specification for the hyperparameter tuning job.

You will display a handful of the fields returned in the `response` object. The two of most interest are:

`response.name`: The AI Platform (Unified) fully qualified identifier assigned to this custom job. We will save this identifier for using in subsequent steps.

`response.state`: The current state of the custom job. 

In [None]:
def create_hyperparameter_tuning_job(hpt_job):
    """Submit a hyperparameter tuning job and print a summary of the response.

    Args:
        hpt_job: The specification for the hyperparameter tuning job.

    Returns:
        The ``name`` field of the response, i.e. the identifier assigned to
        the created job.
    """
    job_client = clients['job']
    created = job_client.create_hyperparameter_tuning_job(
        parent=PARENT,
        hyperparameter_tuning_job=hpt_job,
    )
    # Show the handful of fields that identify the job and its current state
    for field in ("name", "display_name", "state", "create_time", "update_time"):
        print(field + ":", getattr(created, field))
    return created.name


# Save the job name
JOB_ID = create_hyperparameter_tuning_job(HPT_JOB)

### List all hyperparameter tuning jobs

Now that your hyperparameter tuning job is running, let's get a list for all your hyperparameter tuning jobs -- specific to your `PROJECT_ID`. This will probably be just one job, unless you've been running this tutorial multiple times or otherwise been using the AI Platform (Unified) job service.

Use the helper function `list_hyperparameter_tuning_jobs`, which uses the job client service and calls the method `list_hyperparameter_tuning_jobs`. The response object is a list, where each element in the list is a separate job.  

The `response` object for each custom job contains:

- `name`: The AI Platform (Unified) fully qualified identifier for your hyperparameter tuning job.
- `display_name`: The human readable name you assigned to your hyperparameter tuning job.
- `job_spec`: The job specification you provided for your hyperparameter tuning job.
- `state`: The current status of the hyperparameter tuning job:
- `start_time`: When the hyperparameter tuning job was created.
- `end_time`: When the execution of the hyperparameter tuning job ended.
- `update_time`: When the last time there was a status update to the hyperparameter tuning job.

In [None]:
def list_hyperparameter_tuning_jobs():
    """Print every hyperparameter tuning job found under ``PARENT``.

    Calls the job service client's ``list_hyperparameter_tuning_jobs`` method
    and prints each job in the response on its own.
    """
    response = clients['job'].list_hyperparameter_tuning_jobs(parent=PARENT)
    for job in response:
        # Print the individual job, not the whole response on every iteration
        print(job)


list_hyperparameter_tuning_jobs()

### Get information on a hyperparameter tuning job

Next, use this helper function `get_hyperparameter_tuning_job`, which takes the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for the hyperparameter tuning job.

The helper function uses the job client service to get the job information for just this job by calling the method `get_hyperparameter_tuning_job`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for the hyperparameter tuning job.

If you recall, you got the AI Platform (Unified) fully qualified identifier for the hyperparameter tuning job in the `response.name` field when you called the `create_hyperparameter_tuning_job` method, and saved the identifier in the variable `JOB_ID`.

*Note, since this is a dummy empty job, the job will eventually fail.*

In [None]:
def get_hyperparameter_tuning_job(name, silent=False):
    """Fetch a hyperparameter tuning job and optionally print a summary.

    Args:
        name: The fully qualified identifier of the hyperparameter tuning job.
        silent: When true, return the response without printing anything.

    Returns:
        The response object returned by the job service client.
    """
    job = clients['job'].get_hyperparameter_tuning_job(name=name)
    if not silent:
        for field in ("name", "display_name", "state", "create_time", "update_time"):
            print(field + ":", getattr(job, field))
    return job


job_response = get_hyperparameter_tuning_job(JOB_ID)

### Cancel a hyperparameter tuning job

Next, we will show you how to cancel a hyperparameter tuning job. You will go ahead and cancel your "empty" tuning job. Use this helper function `cancel_job`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for your hyperparameter tuning job.

The helper function will use the job service client and call the method `cancel_hyperparameter_tuning_job`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for your hyperparameter tuning job.

We put a try/except around the call since it will throw an exception if the job failed -- which most likely it did.

In [None]:
def cancel_job(name):
    """Request cancellation of a hyperparameter tuning job.

    Args:
        name: The fully qualified identifier of the hyperparameter tuning job.

    Any exception raised by the call is printed rather than propagated; the
    notebook expects the call to fail when the job has already failed.
    """
    try:
        print(clients['job'].cancel_hyperparameter_tuning_job(name=name))
    except Exception as error:
        print(error)


time.sleep(10)
cancel_job(JOB_ID)

### Delete a hyperparameter tuning job

Next, we will show you how to delete a hyperparameter tuning job. You will go ahead and delete your "empty" tuning job. Use the helper function `delete_job`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for your hyperparameter tuning job.

The helper function will use the job service client and call the method `delete_hyperparameter_tuning_job`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for your hyperparameter tuning job.

Afterwards, you will verify that the job has been deleted by calling the method `get_hyperparameter_tuning_job` for the same job. We put a try/except around the call since it will throw an exception if the job is already deleted -- which most likely it is.

In [None]:
def delete_job(name):
    """Delete a hyperparameter tuning job, then look it up again.

    Args:
        name: The fully qualified identifier of the hyperparameter tuning job.

    Exceptions from either call are printed rather than propagated. The
    follow-up lookup serves as the verification step: it prints the error
    raised when the job can no longer be fetched.
    """
    try:
        print("Delete", clients['job'].delete_hyperparameter_tuning_job(name=name))
    except Exception as delete_error:
        print(delete_error)

    try:
        # Only the failure is of interest here; a successful lookup prints nothing
        clients['job'].get_hyperparameter_tuning_job(name=name)
    except Exception as lookup_error:
        print(lookup_error)


time.sleep(10)
delete_job(JOB_ID)

## Hyperparameter Tune a model - Hello HPT

Now that you have seen the basic steps for hyperparameter tuning a custom training job, you will do a new hyperparameter tuning job for a custom training job. There are two ways you can tune a custom model using a container image:

- **Use a Google Cloud prebuilt container**. If you use a prebuilt container, you will additionally specify a Python package to install into the container image. This Python package contains your code for training a custom model and reporting back the objective metric.

- **Use your own custom container image**. If you use your own container, the container needs to contain your code for training a custom model and reporting back the objective metric.

In this tutorial, you will tune the "hello hpt" using a Google Cloud prebuilt container. You need to update the worker pool specification by adding a description for `python_package_spec`. This section will tell the custom job the Python training package to install and which Python module to invoke, along with command line arguments for the Python module.

Let's dive deeper now into the python package specification:

- `executor_image_uri`: This is the docker image which is configured for your custom training job. You will continue to use the same one we used earlier for demonstration.

- `package_uris`: This is a list of the locations (URIs) of your python training packages to install on the provisioned instance. The locations need to be in a Cloud Storage bucket. These can be either individual python files or a zip (archive) of an entire package. In the latter case, the job service will unzip (unarchive) the contents into the docker image.

- `python_module`: The python module (script) to invoke for running the custom training job. In this example, you will be invoking `trainer.task` (the file `trainer/task.py`) -- note that it is not necessary to append the `.py` suffix.

- `args`: The command line arguments to pass to the corresponding Python module. In this example, you will be passing the Cloud Storage location where to store the model artifacts -- `"--model-dir=" + MODEL_DIR`.

In [None]:
# Carry over the accelerator settings chosen earlier: TRAIN_IMAGE is the GPU
# image whenever TRAIN_GPU is set, so the machine must request the GPUs too.
# With the default (None, None) the spec stays machine-type only.
hello_machine_spec = {"machine_type": TRAIN_COMPUTE}
if TRAIN_GPU:
    hello_machine_spec["accelerator_type"] = TRAIN_GPU
    hello_machine_spec["accelerator_count"] = TRAIN_NGPU

WORKER_POOL_SPEC = [
    {
        "replica_count": 1,
        "machine_spec": hello_machine_spec,
        # Install the training package into the prebuilt image and run its module
        "python_package_spec": {
            "executor_image_uri": TRAIN_IMAGE,
            "package_uris":
                ["gs://" + BUCKET_NAME + "/hpt_hello.tar.gz"],
            "python_module": "trainer.task",
            "args": [
                "--model-dir=" + MODEL_DIR,
            ],
        }
    }
]

### Examine the training package

#### Package layout

Before you start the training, let's look at how a Python package is assembled for a custom training job. When unarchived, the package contains the following directory/file layout.

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py

The files `setup.cfg` and `setup.py` are the instructions for installing the package into the operating environment of the docker image.

The file `trainer/task.py` is the python script for executing the custom job. *Note*, when we referred to it in the worker pool specification, we replaced the directory slash with a dot (`trainer.task`) and dropped the file suffix (`.py`).

#### Package Assembly

In the following cells, you will assemble the hyperparameter tuning package.

In [None]:
# Make folder for python training script
# (any folder left over from a previous run is removed first)
! rm -rf custom
! mkdir custom

# Add package information
! touch custom/README.md

# setup.cfg contents; the $setup_cfg reference in the echo command below is
# interpolated from this Python variable
setup_cfg = "[egg_info]\n\
tag_build =\n\
tag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

# setup.py contents: lists the install requirements and discovers the packages
setup_py = "import setuptools\n\
# Requires TensorFlow Datasets\n\
setuptools.setup(\n\
    install_requires=[\n\
        'tensorflow_datasets==1.3.0',\n\
    ],\n\
    packages=setuptools.find_packages())" 
! echo "$setup_py" > custom/setup.py

# PKG-INFO contents: descriptive metadata for the package
pkg_info = "Metadata-Version: 1.0\n\
Name: Hyperparameter Tuning - Hello World\n\
Version: 0.0.0\n\
Summary: Demostration hyperparameter tuning script\n\
Home-page: www.google.com\n\
Author: Google\n\
Author-email: aferlitsch@gmail.com\n\
License: Public\n\
Description: Demo\n\
Platform: AI Platform (Unified)"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder
# (the empty __init__.py makes `trainer` an importable package)
! mkdir custom/trainer
! touch custom/trainer/__init__.py

#### Task.py contents

In the next cell, you will write the contents of the training script task.py. I won't go into detail, it's just there for you to browse. In summary:

- Passes the hyperparameter values for a trial as a command line argument (`parser.add_argument('--lr',...)`)
- Mimics a training loop, where on each loop (epoch) the variable accuracy is set to the loop iteration * the learning rate.
- Reports the objective metric `accuracy` back to the hyperparameter tuning service using `report_hyperparameter_tuning_metric()`.

In [None]:
%%writefile custom/trainer/task.py
# HP Tuning hello world example
  
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
from hypertune import HyperTune
import argparse
import os
import sys
import time
tfds.disable_progress_bar()

# Command line interface: --lr is the learning rate for this run and
# --model-dir is a model directory.
# NOTE: --model-dir is parsed but not used anywhere else in this script.
parser = argparse.ArgumentParser()
parser.add_argument('--lr', dest='lr',
                    default=0.001, type=float,
                    help='Learning rate.')
parser.add_argument(
    '--model-dir',
    dest='model_dir',
    default='/tmp/saved_model',
    type=str,
    help='Model dir.')
args = parser.parse_args()

# Print details of the runtime environment
print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
print(device_lib.list_local_devices())

# Instantiate the HyperTune reporting object
hpt = HyperTune()

# Nine simulated epochs (1 through 9); no model is actually trained
for epoch in range(1,10):
    # mimic metric result at the end of an epoch
    acc = args.lr * epoch
    # save the metric result to communicate back to the HPT service
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy',
        metric_value=acc,
        global_step=epoch)
    print('epoch: {}, accuracy: {}'.format(epoch, acc))
    time.sleep(1)

#### Store training script on your Cloud Storage bucket

Next, we package the training folder into a compressed tar ball, and then store it in your Cloud Storage bucket.

In [None]:
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz gs://$BUCKET_NAME/hpt_hello.tar.gz

#### Reporting back the result of the trial using hypertune

For each trial, your Python script needs to report back to the hyperparameter tuning service the objective metric for which you specified as the criteria for evaluating the trial.

For this example, you will specify in the study specification that the objective metric will be reported back as `accuracy`.

You report back the value of the objective metric using `HyperTune`. This Python module is used to communicate key/value pairs to the hyperparameter tuning service. To setup this reporting in your Python package, you will add code for the following three steps:

1. Import the HyperTune module: `from hypertune import HyperTune`.
2. Instantiate the reporting object: `hpt = HyperTune()`.
3. At the end of every epoch, write the current value of the objective function to the log as a key/value pair using `hpt.report_hyperparameter_tuning_metric()`. In this example, the parameters are:
 - `hyperparameter_metric_tag`: The name of the objective metric to report back. The name must be identical to the name specified in the study specification.
 - `metric_value`: The value of the objective metric to report back to the hyperparameter service.
 - `global_step`: The epoch iteration. In this example the loop starts at 1.

### Create a study specification

Let's start with a simple study. You will just use a single parameter -- the *learning rate*. Since it's just one parameter, it doesn't make much sense to do a random search. Instead, we will do a grid search over a range of values.

- `metrics`:
 - `metric_id`: In this example, the objective metric to report back is `'accuracy'` 
 - `goal`: In this example, the hyperparameter tuning service will evaluate trials to maximize the value of the objective metric.
- `parameters`: The specification for the hyperparameters to tune.
 - `parameter_id`: The name of the hyperparameter that will be passed to the Python package as a command line argument. In this example, it is `lr` for the learning rate.
 - `discrete_value_spec`: A discrete set of values as the search space for the hyperparameter.
- `algorithm`: The search method for selecting hyperparameter values per trial:
 - `GRID_SEARCH`: Combinatorial search over the parameter values -- which is used in this example.
 - `RANDOM_SEARCH`: Random search.
 

In [None]:
# Grid search over three learning rates, maximizing the reported `accuracy`
STUDY_SPEC = dict(
    metrics=[
        dict(
            metric_id="accuracy",
            goal=aip.StudySpec.MetricSpec.GoalType.MAXIMIZE,
        ),
    ],
    parameters=[
        dict(
            parameter_id="lr",
            discrete_value_spec=dict(values=[0.001, 0.01, 0.1]),
            scale_type=aip.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
        ),
    ],
    algorithm=aip.StudySpec.Algorithm.GRID_SEARCH,
)

In [None]:
# Job specification: three sequential trials, one per learning rate in the study
HPT_JOB = dict(
    display_name=JOB_NAME,
    trial_job_spec=dict(worker_pool_specs=WORKER_POOL_SPEC),
    study_spec=STUDY_SPEC,
    max_trial_count=3,
    parallel_trial_count=1,
)

### Start the hyperparameter study for the custom training job

Let's now start the hyperparameter tuning of your custom training job on AI Platform. Use this helper function `create_hyperparameter_tuning_job`

In [None]:
# Submit the tuning job and keep the value returned by the helper for polling
# (presumably the job's resource name; the helper is defined earlier in the notebook).
JOB_ID = create_hyperparameter_tuning_job(HPT_JOB)

### Wait for the study to complete

In [None]:
# States from which the job can no longer reach JOB_STATE_SUCCEEDED.
# Polling must stop on these, otherwise a cancelled job would loop forever.
unsuccessful_end_states = (
    aip.JobState.JOB_STATE_FAILED,
    aip.JobState.JOB_STATE_CANCELLED,
)

# Poll once a minute until the study succeeds or ends unsuccessfully
while True:
    job_response = get_hyperparameter_tuning_job(JOB_ID, True)
    if job_response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print('Study trials have completed')
        break
    print('Study trials have not completed:', job_response.state)
    if job_response.state in unsuccessful_end_states:
        break
    time.sleep(60)

### Review the results of the study

Let's now look at the results of trials. The response object has the property `trials`. This property is a list, with one entry per trial. In this example, you had three trials, so there are three elements. Each element contains the fields:

- `id`: The trial iteration, starting at 1.
- `state`: The outcome of the trial.
- `parameters`: The hyperparameters and corresponding values settings for the trial.
- `final_measurement`: The reported objective metric from the trial. Note, this is the last reported value via `hpt.report_hyperparameter_tuning_metric()`.

The HPT hello example is written so that the higher the learning rate, the higher the accuracy -- `acc = lr * epoch`. In this example, the trial with the largest learning rate (0.1) will be the winner.

In [None]:
# Print the full record of every trial in the job response
for trial in job_response.trials:
    print(trial)

## Hyperparameter Tune a model - Boston Housing

Now that you have seen the overall steps for hyperparameter tuning a custom training job using a Python package that mimics training a model, you will do a new hyperparameter tuning job for a custom training job for a Boston Housing model. 

For this example, you will change two parts:

1. Specify the Boston Housing custom training Python package in place of the HPT hello package.
2. Specify a study specification specific to the hyperparameters used in the Boston Housing custom training Python package.

First, you update the Python package field `package_uris` in the `worker_pool_spec` to the hyperparameter tuning job for Boston Housing Python package.

  - `"--model-dir=" + MODEL_DIR` : The Cloud Storage location where to store the model artifacts for a trial. There are two ways to tell the training script where to save the model artifacts:
      - direct: You pass the Cloud Storage location as a command line argument to your training script (set variable `DIRECT = True`). This will cause the training script to overwrite the model artifacts on each trial. This is recommended if you plan to hypertune only to get the best hyperparameters and will separately do a full training.
      - indirect: The service passes the Cloud Storage location as the environment variable `AIP_MODEL_DIR` to your training script (set variable `DIRECT = False`). In this case, you tell the service the model artifact location in the job specification.  The service will create a unique subfolder (`trial_id/model`) for each trial.  This will cause the training script to write the model artifacts to a unique subfolder on each trial. This is recommended if you plan to do full training on each trial and the best model is the fully trained model.

In [None]:
# Machine specification: the accelerator type is only set when a GPU is requested
machine_spec = {"machine_type": TRAIN_COMPUTE}
if TRAIN_GPU:
    machine_spec["accelerator_type"] = TRAIN_GPU
    machine_spec["accelerator_count"] = TRAIN_NGPU
else:
    machine_spec["accelerator_count"] = 0

# DIRECT = True passes the model location on the command line;
# DIRECT = False passes no arguments to the training script.
DIRECT = False
CMDARGS = ["--model-dir=" + MODEL_DIR] if DIRECT else []

# Single-replica worker pool running the Boston Housing training package
python_package_spec = {
    "executor_image_uri": TRAIN_IMAGE,
    "package_uris": ["gs://" + BUCKET_NAME + "/hpt_boston_housing.tar.gz"],
    "python_module": "trainer.task",
    "args": CMDARGS,
}
WORKER_POOL_SPEC = [
    {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "python_package_spec": python_package_spec,
    }
]

#### Package Assembly

In the following cells, you will assemble the hyperparameter tuning package.

In [None]:
# Make folder for python training script
! rm -rf custom
! mkdir custom

# Add package information
! touch custom/README.md

setup_cfg = "[egg_info]\n\
tag_build =\n\
tag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

setup_py = "import setuptools\n\
# Requires TensorFlow Datasets\n\
setuptools.setup(\n\
    install_requires=[\n\
        'tensorflow_datasets==1.3.0',\n\
    ],\n\
    packages=setuptools.find_packages())" 
! echo "$setup_py" > custom/setup.py

pkg_info = "Metadata-Version: 1.0\n\
Name: Hyperparameter Tuning - Boston Housing\n\
Version: 0.0.0\n\
Summary: Demostration hyperparameter tuning script\n\
Home-page: www.google.com\n\
Author: Google\n\
Author-email: aferlitsch@gmail.com\n\
License: Public\n\
Description: Demo\n\
Platform: AI Platform (Unified)"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder
! mkdir custom/trainer
! touch custom/trainer/__init__.py

#### Task.py contents

In the next cell, you will write the contents of the training script task.py. I won't go into detail, it's just there for you to browse. In summary:

- Parse the command line arguments for the hyperparameter settings for the current trial.
 - Get the directory where to save the model artifacts from the command line (`--model-dir`), and if not specified, then from the environment variable `AIP_MODEL_DIR`.
- Download and preprocess the Boston Housing dataset.
- Build a DNN model.
- The learning rate and number of units per dense layer hyperparameter values are used during the compile of the model.
- A definition of a callback `HPTCallback` which obtains the validation loss at the end of each epoch (`on_epoch_end()`) and reports it to the hyperparameter tuning service using `hpt.report_hyperparameter_tuning_metric()`.
- Train the model with the `fit()` method and specify a callback which will report the validation loss back to the hyperparameter tuning service.

In [None]:
%%writefile custom/trainer/task.py
# Custom Training for Boston Housing
  
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
from hypertune import HyperTune
import numpy as np
import argparse
import os
import sys
tfds.disable_progress_bar()

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', dest='model_dir',
                    default=os.getenv('AIP_MODEL_DIR'), type=str, help='Model dir.')
parser.add_argument('--lr', dest='lr',
                    default=0.001, type=float,
                    help='Learning rate.')
parser.add_argument('--units', dest='units',
                    default=64, type=int,
                    help='Number of units.')
parser.add_argument('--epochs', dest='epochs',
                    default=20, type=int,
                    help='Number of epochs.')
parser.add_argument('--param-file', dest='param_file',
                    default='/tmp/param.txt', type=str,
                    help='Output file for parameters')
args = parser.parse_args()

print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))

def make_dataset():
  # Scaling Boston Housing data features
  def scale(feature):
    max = np.max(feature)
    feature = (feature / max).astype(np.float)
    return feature, max

  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data(
    path="boston_housing.npz", test_split=0.2, seed=113
  )
  params = []
  for _ in range(13):
    x_train[_], max = scale(x_train[_])
    x_test[_], _ = scale(x_test[_])
    params.append(max)
    
  # store the normalization (max) value for each feature
  with tf.io.gfile.GFile(args.param_file, 'w') as f:
    f.write(str(params))
  return (x_train, y_train), (x_test, y_test)

# Build the Keras model
def build_and_compile_dnn_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Dense(args.units, activation='relu', input_shape=(13,)),
      tf.keras.layers.Dense(args.units, activation='relu'),
      tf.keras.layers.Dense(1, activation='linear')
  ])
  model.compile(
      loss='mse',
      optimizer=tf.keras.optimizers.RMSprop(learning_rate=args.lr))
  return model

model = build_and_compile_dnn_model()

# Instantiate the HyperTune reporting object
hpt = HyperTune()

# Reporting callback
class HPTCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        global hpt
        hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='val_loss',
        metric_value=logs['val_loss'],
        global_step=epoch)

# Train the model
BATCH_SIZE = 16
(x_train, y_train), (x_test, y_test) = make_dataset()
model.fit(x_train, y_train, epochs=args.epochs, batch_size=BATCH_SIZE, validation_split=0.1, callbacks=[HPTCallback()])
model.save(args.model_dir)

#### Store training script on your Cloud Storage bucket

Next, we package the training folder into a compressed tar ball, and then store it in your Cloud Storage bucket.

In [None]:
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz gs://$BUCKET_NAME/hpt_boston_housing.tar.gz

#### Reporting back the result of the trial using hypertune

For each trial, your Python script needs to report back to the hyperparameter tuning service the objective metric for which you specified as the criteria for evaluating the trial.

For this example, you will specify in the study specification that the objective metric will be reported back as `val_loss`.

You report back the value of the objective metric using `HyperTune`. This Python module is used to communicate key/value pairs to the hyperparameter tuning service. To setup this reporting in your Python package, you will add code for the following three steps:

1. Import the HyperTune module: `from hypertune import HyperTune`.
2. Instantiate the reporting object: `hpt = HyperTune()`.
3. At the end of every epoch, write the current value of the objective function to the log as a key/value pair using `hpt.report_hyperparameter_tuning_metric()`. In this example, the parameters are:
 - `hyperparameter_metric_tag`: The name of the objective metric to report back. The name must be identical to the name specified in the study specification.
 - `metric_value`: The value of the objective metric to report back to the hyperparameter service.
 - `global_step`: The epoch iteration, starting at 0.

### Create a study specification

In this study, you will tune two hyperparameters using the random search algorithm: 

- **learning rate**: The search space is a set of discrete values.
- **units**: The search space is a continuous integer range between 32 and 256.

The objective (goal) is to minimize the validation loss.

You will run a maximum of six trials.

In [None]:
# Random search over learning rate and layer width, minimizing the reported `val_loss`
STUDY_SPEC = dict(
    metrics=[
        dict(
            metric_id="val_loss",
            goal=aip.StudySpec.MetricSpec.GoalType.MINIMIZE,
        ),
    ],
    parameters=[
        dict(
            parameter_id="lr",
            discrete_value_spec=dict(values=[0.001, 0.01, 0.1]),
            scale_type=aip.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
        ),
        dict(
            parameter_id="units",
            integer_value_spec=dict(min_value=32, max_value=256),
            scale_type=aip.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
        ),
    ],
    algorithm=aip.StudySpec.Algorithm.RANDOM_SEARCH,
)

### Assemble the job specification

Let's now assemble the description for the hyperparameter tuning job specification:

- `display_name`: The human readable name you assign to this custom job.
- `trial_job_spec`: The specification for the custom job. 
    - `base_output_directory`: This tells the service the Cloud Storage location where to save the model artifacts (when variable `DIRECT = False`). The service will then pass the location to the training script as the environment variable `AIP_MODEL_DIR`, and the path will be of the form:
    
                <output_uri_prefix>/<trial_id>/model
                
- `study_spec`: The specification for the hyperparameter tuning study.
- `max_trial_count`: The maximum number of trials for study. On each trial, the training script will be called with the hyperparameter parameter settings selected for that trial.
- `parallel_trial_count`: The number of trials to do in parallel. If set to one, then the trials are done sequentially.             

In [None]:
# The trial job always carries the worker pool; the output directory is added
# only when the service, not the command line, supplies the model location.
TRIAL_JOB_SPEC = {"worker_pool_specs": WORKER_POOL_SPEC}
if not DIRECT:
    TRIAL_JOB_SPEC["base_output_directory"] = {"output_uri_prefix": MODEL_DIR}

# Job specification: up to six trials, run one at a time
HPT_JOB = dict(
    display_name=JOB_NAME,
    trial_job_spec=TRIAL_JOB_SPEC,
    study_spec=STUDY_SPEC,
    max_trial_count=6,
    parallel_trial_count=1,
)

### Start the hyperparameter study for the custom training job

Let's now start the hyperparameter tuning of your custom training job on AI Platform. Use this helper function `create_hyperparameter_tuning_job`

In [None]:
# Submit the tuning job and keep the value returned by the helper for polling
# (presumably the job's resource name; the helper is defined earlier in the notebook).
JOB_ID = create_hyperparameter_tuning_job(HPT_JOB)

### Wait for the study to complete

In [None]:
# States from which the job can no longer reach JOB_STATE_SUCCEEDED.
# Polling must stop on these, otherwise a cancelled job would loop forever.
unsuccessful_end_states = (
    aip.JobState.JOB_STATE_FAILED,
    aip.JobState.JOB_STATE_CANCELLED,
)

# Poll once a minute until the study succeeds or ends unsuccessfully
while True:
    job_response = get_hyperparameter_tuning_job(JOB_ID, True)
    if job_response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print('Study trials have completed')
        break
    print('Study trials have not completed:', job_response.state)
    if job_response.state in unsuccessful_end_states:
        break
    time.sleep(60)

### Review the results of the study

Let's now look at the results of trials. 

In [None]:
# Track the best trial as (trial id, learning rate, units, validation loss).
# The study minimizes `val_loss`, so the best trial is the one with the
# smallest final value; start from +inf so any finite loss replaces it.
best = (None, None, None, float('inf'))
for trial in job_response.trials:
    print(trial)
    try:
        val_loss = float(trial.final_measurement.metrics[0].value)
        # Look parameters up by id instead of relying on their position
        hparams = {p.parameter_id: float(p.value) for p in trial.parameters}
        candidate = (trial.id, hparams['lr'], hparams['units'], val_loss)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
        # For example, a trial that ended without a final measurement
        print('Skipping trial', trial.id, 'without usable results:', repr(e))
        continue
    if val_loss < best[3]:
        best = candidate

Let's look at which trial was the best:

In [None]:
# `best` holds (trial id, first parameter, second parameter, final metric).
# The study spec lists `lr` then `units` and reports `val_loss`, so label the
# values accordingly (assumes trial parameters follow the study-spec order).
print("ID", best[0])
print("Learning Rate", best[1])
print("Units", best[2])
print("Validation Loss", best[3])

## Get the Best Model

If you used the method of having the service tell the training script where to save the model artifacts (`DIRECT = False`), then the model artifacts for the best model are saved at:

    MODEL_DIR/<best_trial_id>/model
    
The remaining cells, before cleanup, are only executed if you used this method.

In [None]:
if not DIRECT:
    # <MODEL_DIR>/<best trial id>/model
    BEST_MODEL_DIR = '/'.join([MODEL_DIR, best[0], 'model'])

## Load the saved model

Your model is stored in a TF SavedModel format in a Cloud Storage bucket. Let's go ahead and load it from the Cloud Storage bucket, and then you can do some things, like evaluate the model, and do a prediction.

To load, you use the TF.Keras `tf.keras.models.load_model()` method, passing it the Cloud Storage path where the model is saved -- specified by `BEST_MODEL_DIR`.

In [None]:
# BEST_MODEL_DIR is only defined for the indirect path (DIRECT = False), and the
# cells that use `model` are guarded by `not DIRECT`, so load under that condition.
if not DIRECT:
    import tensorflow as tf
    model = tf.keras.models.load_model(BEST_MODEL_DIR)

## Evaluate the model

Now let's find out how good the best model is. 

### Load evaluation data

You will load the Boston Housing test (holdout) data from `tf.keras.datasets`, using the method `load_data()`. This will return the dataset as a tuple of two elements. The first element is the training data and the second is the test data. Each element is also a tuple of two elements: the feature data, and the corresponding labels (median value of owner-occupied home).

You don't need the training data, which is why it is loaded as `(_, _)`.

Before you can run the data through evaluation, you need to preprocess it:

x_test:
1. Normalize (rescaling) the data in each column by dividing each value by the maximum value of that column. This will replace each single value with a 32-bit floating point number between 0 and 1.

In [None]:
if not DIRECT:
    from tensorflow.keras.datasets import boston_housing
    import numpy as np

    # Only the test (holdout) split is kept; the training split is discarded
    (_, _), (x_test, y_test) = boston_housing.load_data(
        path="boston_housing.npz", test_split=0.2, seed=113)

    def scale(feature):
        """Divide `feature` by its maximum value and return it as float32."""
        return (feature / np.max(feature)).astype(np.float32)

    # NOTE(review): x_test[row] selects row `row` (one sample), not a feature
    # column, so only the first 13 rows are rescaled. The training script
    # indexes the same way, so change both together.
    for row in range(13):
        x_test[row] = scale(x_test[row])
    x_test = x_test.astype(np.float32)

    print(x_test.shape, x_test.dtype, y_test.shape)

### Evaluate the model

Let's evaluate how well the best model in the tuning job did. 

In [None]:
# Evaluate on the holdout data; only runs for the indirect path (DIRECT = False)
if not DIRECT:
    model.evaluate(x_test, y_test)

## Upload the model for serving


Next, you will upload your TF.Keras model from the custom job to the AI Platform (Unified) model service, which will create an AI Platform (Unified) model resource for your custom model. You will also need to know the name (signature) of the model's serving input layer. You will use this subsequently when making a prediction request.


## Get the serving function signature

You can get the signatures of your model's input and output layers by reloading the model into memory, and querying it for the signatures corresponding to each layer.

For our purpose, you need the signature of the serving function. Why? Well, when you send your data for prediction as a HTTP request packet, you will need to specify the name of the serving input layer of the model in the request.

In [None]:
if not DIRECT:
    # Imported here so the cell does not depend on an earlier cell's import
    import tensorflow as tf

    # Reload the best trial's SavedModel from BEST_MODEL_DIR, the same path
    # that is later uploaded, to inspect its serving signature.
    loaded = tf.saved_model.load(BEST_MODEL_DIR)

    # Name of the first keyword input of the default serving function
    input_name = list(loaded.signatures['serving_default'].structured_input_signature[1].keys())[0]
    print('Serving function input:', input_name)

### Upload the model

Use this helper function `upload_model` to upload your best model, stored in SavedModel format, up to the model service, which will instantiate a AI Platform (Unified) model instance for your model. Once you've done that, you can use the model in the same way as any other AI Platform (Unified) model instance, such as deploying to an endpoint for serving predictions.

The helper function takes the parameters:

- `display_name`: A human readable name for the model.
- `image_uri`: The container image for the model deployment.
- `model_uri`: The Cloud Storage path to our SavedModel artifact. For this tutorial, this is the Cloud Storage location where the `trainer/task.py` saved the model, which we specified in the variable `MODEL_DIR`.

The helper function uses the model client service and calls the method `upload_model`, which takes the parameters:

- `parent`: The AI Platform (Unified) location root path for dataset, model and endpoint resources. 
- `model`: The specification for the AI Platform (Unified) model instance.

Let's now dive deeper into the AI Platform (Unified) model specification `model`. This is a dictionary object that consists of the following fields:

- `display_name`: A human readable name for the model.
- `metadata_schema_uri`: Since our model was built without a AI Platform (Unified) managed dataset, we will leave this blank (`''`).
- `artifact_uri`: The Cloud Storage path where the model is stored in SavedModel format. 
- `container_spec`: This is the specification for the docker container that will be installed on the endpoint, from which the model will serve predictions. Use the variable you set earlier `DEPLOY_GPU != None` to use a GPU; otherwise only a CPU is allocated.

Uploading a model into a AI Platform (Unified) model resource returns a long running operation, since it may take a few moments. You call `response.result()`, which is a synchronous call and will return when the AI Platform (Unified) model resource is ready. 

The helper function returns the AI Platform (Unified) fully qualified identifier for the corresponding AI Platform (Unified) model instance `upload_model_response.model`. You will save the identifier for subsequent steps in the variable `model_to_deploy_name`.

In [None]:
# Container image passed to upload_model() as the model's serving image
IMAGE_URI = DEPLOY_IMAGE


def upload_model(display_name, image_uri, model_uri):
    """Upload a saved model to the model service and wait for the result.

    Args:
        display_name: Human readable name for the model resource.
        image_uri: Container image set in the model's container spec.
        model_uri: Location of the model artifacts (set as `artifact_uri`).

    Returns:
        The `model` field of the upload response.
    """
    serving_container = {
        "image_uri": image_uri,
        "command": [],
        "args": [],
        "env": [{"name": "env_name", "value": "env_value"}],
        "ports": [{"container_port": 8080}],
        "predict_route": "",
        "health_route": "",
    }
    model_spec = dict(
        display_name=display_name,
        metadata_schema_uri="",
        artifact_uri=model_uri,
        container_spec=serving_container,
    )

    # The upload is a long running operation; wait up to 180 seconds for it
    operation = clients['model'].upload_model(parent=PARENT, model=model_spec)
    print("Long running operation:", operation.operation.name)
    uploaded = operation.result(timeout=180)

    print("upload_model_response")
    print(" model:", uploaded.model)
    return uploaded.model


# Upload the best trial's model (indirect path only) and keep the returned model identifier
if not DIRECT:
    model_to_deploy_name = upload_model("boston-" + TIMESTAMP, IMAGE_URI, BEST_MODEL_DIR)

### Get model information

Now let's get the model information for just your model. Use this helper function `get_model`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed model.

This helper function uses the AI Platform (Unified) model client service, and calls the method `get_model`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed model.

In [None]:
def get_model(name):
    """Fetch the model resource identified by `name` from the model client and print it."""
    print(clients['model'].get_model(name=name))


# Print the uploaded model's information (indirect path only)
if not DIRECT:
    get_model(model_to_deploy_name)

# Cleaning up

To clean up all GCP resources used in this project, you can [delete the GCP
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Managed model
- Google Storage Bucket

In [None]:
delete_model = True
delete_bucket = True

# Delete the model using the AI Platform (Unified) fully qualified identifier for the model
try:
    if delete_model:
        clients['model'].delete_model(name=model_to_deploy_name)
except Exception as e:
    print(e)

if delete_bucket and 'BUCKET_NAME' in globals():
    ! gsutil rm -r gs://$BUCKET_NAME
        
# Collect any unclaimed memory
import gc
gc.collect()