In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# AI Platform (Unified) SDK: AutoML image classification model for online prediction

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-platform-samples/blob/master/notebooks/deepdive/automl/image/ucaip_automl_image_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/notebooks/deepdive/automl/image/ucaip_automl_image_classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>
<br/><br/><br/>

# Overview


This tutorial demonstrates how to use the AI Platform (Unified) Python SDK to create image classification models and do online prediction using Google Cloud's [AutoML Vision](https://cloud.google.com/automl).

### Dataset

The dataset used for this tutorial is the [Flowers dataset](https://www.tensorflow.org/datasets/catalog/tf_flowers) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). The version of the dataset you will use in this tutorial is stored in a public Cloud Storage bucket. The trained model predicts the type of flower an image is from a class of five flowers: daisy, dandelion, rose, sunflower, or tulip.

### Objective 

In this notebook, you will learn how to create an image classification model with AutoML Vision from a Python script using the AI Platform (Unified) SDK. You can alternatively create models with AutoML Vision from the command line using `gcloud` or online using Google Cloud Console.

The steps performed include: 

- Create an AI Platform (Unified) managed Dataset.
- Train the model (this tutorial sets a training budget of 8 node hours).
- View the model evaluation.
- Deploy the model to a serving endpoint.
- Make a prediction.
- Undeploy the model.

### Costs 

This tutorial uses billable components of Google Cloud Platform (GCP):

* Cloud AI Platform
* Cloud Storage

Learn about [Cloud AI Platform
pricing](https://cloud.google.com/ml-engine/docs/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the latest (preview) version of AI Platform (Unified) SDK.

In [None]:
! pip3 install -U google-cloud-aiplatform --user

Install the Google `cloud-storage` library as well.

In [None]:
! pip3 install google-cloud-storage

### Restart the Kernel

Once you've installed the AI Platform (Unified) SDK and Google cloud-storage, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Skip the restart when the AUTORUN environment variable is set.
if not os.getenv("AUTORUN"):
    # Restart the kernel so the freshly installed packages can be found.
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

## Before you begin

### GPU run-time

**Make sure you're running this notebook in a GPU runtime if you have that option. In Colab, select Runtime > Change runtime type > GPU**

### Set up your GCP project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a GCP project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)

4. [Google Cloud SDK](https://cloud.google.com/sdk) is already installed in AI Platform Notebooks.

5. Enter your project ID in the cell below. Then run the  cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Project ID

**If you don't know your project ID**, try to get your project ID using `gcloud` command by executing the second cell below.

In [None]:
# Google Cloud project ID. If the placeholder is left unchanged, the next cell
# tries to read the project from the gcloud configuration instead.
PROJECT_ID = "[your-project-id]" #@param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make the Cloud SDK use PROJECT_ID for the gcloud commands in this notebook.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook. Below are the regions supported for AI Platform (Unified). We recommend choosing the region closest to you when possible.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You cannot use a Multi-Regional Storage bucket for training with AI Platform. Not all regions provide support for all AI Platform services. For the latest support per region, see [Region support for AI Platform (Unified) services](https://cloud.google.com/ai-platform-unified/docs/general/locations).

In [None]:
# Region used later to build the API endpoints and the resource parent path.
REGION = 'us-central1' #@param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, we create a timestamp for each instance session, and append onto the name of resources which will be created in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYMMDDHHMMSS) appended to the names of created resources.
TIMESTAMP = "{:%Y%m%d%H%M%S}".format(datetime.now())

### Authenticate your GCP account

**If you are using AI Platform Notebooks**, your environment is already
authenticated. Skip this step.

*Note: If you are on AI Platform notebook and run the cell, the cell knows to skip executing the authentication steps.*

In [None]:
import os
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# If on AI Platform, then don't execute this code.
# NOTE(review): the check relies on this metadata file existing only on
# AI Platform Notebooks images -- confirm.
if not os.path.exists('/opt/deeplearning/metadata/env_version'):
    if 'google.colab' in sys.modules:
        # Colab runtime: authenticate through the Colab auth helper.
        from google.colab import auth as google_auth
        google_auth.authenticate_user()

    # If you are running this tutorial in a notebook locally, replace the string
    # below with the path to your service account key and run this cell to
    # authenticate your Google Cloud account.
    else:
        %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

    # Log in to your account on Google Cloud
    # (this runs after either branch above, Colab or local).
    ! gcloud auth login

### Create a Cloud Storage bucket

**The following steps are required if your data is in your own local Cloud Storage bucket, regardless of your notebook environment.**

This tutorial is designed to use training data that is in a public Cloud Storage bucket. You may alternatively use your own training data that you have stored in a local Cloud Storage bucket.

Set the name of your Cloud Storage bucket below. It must be unique across all
Cloud Storage buckets. 

In [None]:
# Set to True to use your own Cloud Storage bucket; the bucket cells that
# follow only act when this flag is True.
local = False
if local:
    BUCKET_NAME = "[your-bucket-name]" #@param {type:"string"}

In [None]:
# When using your own bucket and no name was supplied (empty, None or the
# placeholder), derive one from the project ID and the session timestamp.
if local and (not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]"):
    BUCKET_NAME = PROJECT_ID + "ucaip-automl-" + TIMESTAMP

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in REGION (only when using your own bucket).
if local:
    ! gsutil mb -l $REGION gs://$BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm it is accessible (only when using your own bucket).
if local:
    ! gsutil ls -al gs://$BUCKET_NAME

### Set up variables

Let's set up some variables used to create an AutoML model.

### Import libraries and define constants

#### Import AI Platform (Unified) SDK

Import the AI Platform (Unified) SDK into our Python environment.

In [None]:
import os
import sys
import time

from google.cloud.aiplatform import gapic as aip

#### AI Platform (Unified) constants

Let's setup some constants for AI Platform (Unified):

- `API_ENDPOINT`: The AI Platform (Unified) API service endpoint for dataset, model, job, pipeline and endpoint services.
- `API_PREDICT_ENDPOINT`: The AI Platform (Unified) API service endpoint for prediction.
- `PARENT`: The AI Platform (Unified) location root path for dataset, model and endpoint resources.

In [None]:
# API Endpoint
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)
API_PREDICT_ENDPOINT = "{}-prediction-aiplatform.googleapis.com".format(REGION)

# AI Platform (Unified) location root path for your dataset, model and endpoint resources
PARENT = "projects/" + PROJECT_ID + "/locations/" + REGION

#### AutoML constants

Now setup some constants for AutoML:

- Dataset Schemas: Tells the managed dataset service which type of dataset it is.
- Data Labeling (Annotations) Schemas: Tells the managed dataset service how the data is labeled (annotated).
- Dataset Training Schemas: Tells the managed pipelines service the task (e.g., classification) to train the model for.

In [None]:
# Image Dataset type
IMAGE_SCHEMA = 'google-cloud-aiplatform/schema/dataset/metadata/image_1.0.0.yaml'
# Image Labeling type
IMPORT_SCHEMA_IMAGE_CLASSIFICATION = "gs://google-cloud-aiplatform/schema/dataset/ioformat/image_classification_single_label_io_format_1.0.0.yaml"
# Image Training task
TRAINING_IMAGE_CLASSIFICATION_SCHEMA = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_image_classification_1.0.0.yaml"

# Tutorial

Now you are ready to start creating your own AutoML Vision model for image classification.

## Clients

The AI Platform (Unified) SDK works as a client/server model. On your side, the Python script, you will create a client that sends requests and receives responses from the server -- AI Platform.

You will use several clients in this tutorial, so set them all up upfront.

- Dataset Service for managed datasets.
- Model Service for managed models.
- Pipeline Service for training.
- Endpoint Service for deployment.
- Prediction Service for serving. *Note*: Prediction has a different service endpoint.

In [None]:
# client options same for all services
client_options = {"api_endpoint": API_ENDPOINT}
predict_client_options = {"api_endpoint": API_PREDICT_ENDPOINT}


def create_dataset_client():
    client = aip.DatasetServiceClient(
        client_options=client_options
    )
    return client


def create_model_client():
    client = aip.ModelServiceClient(
        client_options=client_options
    )
    return client


def create_pipeline_client():
    client = aip.PipelineServiceClient(
        client_options=client_options
    )
    return client


def create_endpoint_client():
    client = aip.EndpointServiceClient(
        client_options=client_options
    )
    return client


def create_prediction_client():
    client = aip.PredictionServiceClient(
        client_options=predict_client_options
    )
    return client


clients = {}
clients['dataset'] = create_dataset_client()
clients['model'] = create_model_client()
clients['pipeline'] = create_pipeline_client()
clients['endpoint'] = create_endpoint_client()
clients['prediction'] = create_prediction_client()

for client in clients.items():
    print(client)

## Dataset

Now that your clients are ready, your first step is to create a managed dataset instance, and then upload the labeled data to it.

### Create managed dataset instance

Use this helper function `create_dataset` to create the instance of your managed dataset. This function does the following:

1. Uses the dataset client service.
2. Creates an AI Platform (Unified) dataset object (`aip.Dataset`), with the parameters:
 - `display_name`: The human-readable name you choose to give it, and
 - `metadata_schema_uri`: The dataset type. For this tutorial this will be the schema for dataset type.
3. Calls the client dataset service method `create_dataset`, with the parameters:
 - `parent`: AI Platform (Unified) location root path for your dataset and model resources.
 - `dataset`: the AI Platform (Unified) dataset object instance you created.
4. The method returns an `operation` object.

An `operation` object is how AI Platform (Unified) handles asynchronous calls for long running operations. While this step usually goes fast, when you first use it in your project, there is a longer delay due to provisioning.

You can use the `operation` object to get status on the operation (e.g., create managed dataset) or to cancel the operation, by invoking an operation method:

| Method      | Description |
| ----------- | ----------- |
| result()    | Waits for the operation to complete and returns a result object in JSON format.      |
| running()   | Returns True/False on whether the operation is still running.        |
| done()      | Returns True/False on whether the operation is completed. |
| cancelled() | Returns True/False on whether the operation was cancelled. |
| cancel()    | Cancels the operation (this may take up to 30 seconds). |


In [None]:
TIMEOUT = 60
DATA_SCHEMA = IMAGE_SCHEMA


def create_dataset(name, schema, labels=None, timeout=TIMEOUT):
    """Create a managed dataset and wait for the create operation to finish.

    Args:
        name: Display name for the dataset.
        schema: Metadata schema path WITHOUT the "gs://" prefix; the prefix is
            prepended here to form `metadata_schema_uri`.
        labels: Optional labels passed through to `aip.Dataset`.
        timeout: Seconds to wait for the long running operation's result.

    Returns:
        A dict `{'name': <dataset resource name>, 'schema': schema}` on
        success. If any exception is raised, it is printed and the tuple
        `(None, None)` is returned instead.
    """
    start_time = time.time()
    try:
        dataset = aip.Dataset(display_name=name, metadata_schema_uri="gs://" + schema, labels=labels)

        operation = clients['dataset'].create_dataset(parent=PARENT, dataset=dataset)
        print("Long running operation:", operation.operation.name)
        # Honor the caller-supplied timeout; previously the module-level
        # TIMEOUT was always used, so the `timeout` argument had no effect.
        response = operation.result(timeout=timeout)
        print("time:", time.time() - start_time)
        print("response")
        print(" name:", response.name)
        print(" display_name:", response.display_name)
        print(" metadata_schema_uri:", response.metadata_schema_uri)
        print(" metadata:", dict(response.metadata))
        print(" create_time:", response.create_time)
        print(" update_time:", response.update_time)
        print(" etag:", response.etag)
        print(" labels:", dict(response.labels))
        return {'name': response.name, 'schema': schema}
    except Exception as e:
        print("exception:", e)
        return (None, None)


dataset = create_dataset("automl-" + TIMESTAMP, DATA_SCHEMA)

### Data preparation

The AI Platform (Unified) managed dataset for images has some requirements for your data.

- Images must be stored in a Cloud Storage bucket.
- Each image file must be in an image format (PNG, JPEG, BMP, ...).
- There must be an index file stored in your Cloud Storage bucket that contains the path and label for each image.
- The index file must be either CSV or JSONL.

#### CSV

For image classification, the CSV index file has the requirements:

- No heading.
- First column is the Cloud Storage path to the image.
- Second column is the label.

#### JSONL

For image classification, the JSONL index file has the requirements:

- Each data item is a separate JSON object, on a separate line.
- The key/value pair 'image_gcs_uri' is the Cloud Storage path to the image.
- The key/value pair 'classification_annotation' is the label field.
 - The key/value pair 'display_name' is the label

    { 'image_gcs_uri': image, 'classification_annotation': { 'display_name': label } }
    
*Note*: The dictionary key fields may alternatively be in camelCase. For example, 'image_gcs_uri' can also be 'imageGcsUri'.

### Dataset splitting

#### CSV

Each row entry in a CSV index file can be preceded by a first column that indicates whether the data is part of the training (TRAINING), test (TEST) or validation (VALIDATION) data. Alternatively, AI Platform (Unified) supports the CAIP (pre-AI Platform (Unified)) version of the tags: TRAIN, TEST and VALIDATE. For example:

    TRAINING, "DATA_ITEM", "ITEM_LABEL"
    TEST, "DATA_ITEM", "ITEM_LABEL"
    VALIDATION, "DATA_ITEM", "ITEM_LABEL"
    
#### JSONL

Each object entry in a JSONL index file can have a 'ml_use' key/value pair that indicates whether the data is part of the training (training), test (test) or validation (validation) data.

    { 'image_gcs_uri': image, 'classification_annotation': { 'display_name': label }, 'data_item_resource_labels':{'aiplatform.googleapis.com/ml_use':'training'} }
    
Otherwise, AutoML will automatically split the dataset for you.

#### Location of Cloud Storage training data.

Let's now set the variable `IMPORT_FILE` to the location of the CSV or JSONL index file in Cloud Storage.

Set the local variable `IMPORT_FORMAT` to indicate whether your dataset is a CSV or JSONL index file.

Additionally, you can set the variable `SPLIT_TYPE` to choose how AutoML will handle splitting the dataset into training, test and validation sets:

- DEFAULT: AutoML chooses the split.
- ML_USE: Examples are tagged with which set they belong to (TRAINING, TEST, VALIDATION).
- FRACTION: Percentage split ratios specified in `input_config` when training.

In [None]:
# Image Classification index files (public Cloud Storage).
# CSV, no split
FLOWERS_CSV = 'gs://cloud-samples-data/vision/automl_classification/flowers/all_data_v2.csv'
# CSV, ML_USE split
FLOWERS_SPLIT_CSV = 'gs://cloud-samples-data/ai-platform/flowers/flowers_split.csv'

# JSONL, no split
FLOWERS_JSONL = 'gs://cloud-samples-data/vision/automl_classification/flowers/flowers.jsonl'
# JSONL, ML_USE split
FLOWERS_SPLIT_JSONL = 'gs://cloud-samples-data/vision/automl_classification/flowers/flowers-50.jsonl'

IMPORT_FORMAT = 'CSV'  # [CSV, JSONL]
SPLIT_TYPE = 'DEFAULT'  # [ML_USE, FRACTION, DEFAULT]

# Pick the index file: any format other than 'CSV' is treated as JSONL, and
# only SPLIT_TYPE == 'ML_USE' selects the pre-split variant.
if IMPORT_FORMAT == 'CSV':
    IMPORT_FILE = FLOWERS_SPLIT_CSV if SPLIT_TYPE == 'ML_USE' else FLOWERS_CSV
else:
    IMPORT_FILE = FLOWERS_SPLIT_JSONL if SPLIT_TYPE == 'ML_USE' else FLOWERS_JSONL

#### Quick peek at your data

You will use a version of the Flowers dataset that is stored in a public Cloud Storage bucket, using a CSV or JSONL index file. 

Let's start by doing a quick peek at the data. You count the number of examples by counting the number of rows in the CSV or JSONL index file  (`wc -l`) and then peek at the first few rows.

In [None]:
# Count the rows of the index file; the shell output is captured as a list of
# lines, so the first element holds the `wc -l` result.
count = ! gsutil cat $IMPORT_FILE | wc -l
print("Number of Examples", int(count[0]))

# Show the first rows of the index file (`head` prints 10 lines by default).
print("First 10 rows")
! gsutil cat $IMPORT_FILE | head

### Import data

Now, let's import the data into your AI Platform (Unified) managed dataset. Use this helper function `import_data` to import the data. The function does the following:

- Uses the dataset client.
- Calls the client method `import_data`, with the parameters:
 - `name`: The AI Platform (Unified) fully qualified identifier of the managed dataset (the `name` field returned when you created the dataset).
 - `import_configs`: The import configuration.
- `import_configs` A Python list containing a dictionary, with the key/value entries:
 - `gcs_source`: A list of URIs to the paths of the one or more index files.
 - `import_schema_uri`: The schema identifying the labeling type. For this example, we will use the image classification labeling type.

The `import_data()` method returns a long running `operation` object. This will take a few minutes to complete. If you are in a live tutorial, this would be a good time to ask questions, or take a personal break.

In [None]:
IMPORT_SCHEMA = IMPORT_SCHEMA_IMAGE_CLASSIFICATION


def import_data(dataset, gcs_source, schema):
    """Import an index file into a managed dataset and wait for completion.

    Args:
        dataset: Dict holding the dataset resource name under the 'name' key.
        gcs_source: Cloud Storage URI of the index file to import.
        schema: Import schema URI describing the labeling type.

    Returns:
        The long running operation object once it has finished, or None if an
        exception was raised (the exception is printed).
    """
    import_configs = [
        dict(gcs_source=dict(uris=[gcs_source]), import_schema_uri=schema)
    ]
    print("dataset:", dataset['name'])
    began = time.time()
    try:
        lro = clients['dataset'].import_data(name=dataset['name'], import_configs=import_configs)
        print("Long running operation:", lro.operation.name)

        # Blocks until the import has finished (no timeout is applied).
        outcome = lro.result()
        print("result:", outcome)
        print("time:", int(time.time() - began), "secs")
        print("error:", lro.exception())
        print("meta :", lro.metadata)
        print("after: running:", lro.running(), "done:", lro.done(), "cancelled:", lro.cancelled())

        return lro
    except Exception as err:
        print("exception:", err)
        return None


import_data(dataset, IMPORT_FILE, IMPORT_SCHEMA)

### Get dataset information

Now that the data is imported into your AI Platform (Unified) managed dataset, let's get some information about the current state of the dataset. Use this helper function `get_dataset`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified dataset identifier, which is in the form:

    projects/*[project_id]*/locations/*[region]*/datasets/*[dataset id]*

The helper function uses the dataset service client's method `get_dataset`, which takes as a parameter:

- `name`: The AI Platform (Unified) fully qualified dataset identifier.
    
You got the fully qualified dataset identifier in the `name` field of the response object when you created the AI Platform (Unified) managed dataset instance.

The method returns an AI Platform (Unified) managed dataset object.

In [None]:
def get_dataset(name):
    """Fetch a managed dataset by resource name and print its main fields.

    Args:
        name: Fully qualified dataset resource name.

    Returns:
        None; the information is only printed.
    """
    managed_ds = clients['dataset'].get_dataset(name=name)
    print("TYPE", type(managed_ds))

    print("name:", managed_ds.name)
    print("display name:", managed_ds.display_name)
    print("create_time:", managed_ds.create_time)
    print("update_time:", managed_ds.update_time)
    print("labels:", managed_ds.labels)
    print("metadata_schema_uri:", managed_ds.metadata_schema_uri)
    print("metadata:", dict(managed_ds.metadata))


get_dataset(dataset['name'])

### List the data items

Let's now use the dataset client service to get a list of all the examples (data items) you uploaded into your AI Platform (Unified) managed dataset.

Use this helper function `list_data_items`, which calls the dataset client service method `list_data_items`, with the parameter:

- `parent` : The AI Platform (Unified) fully qualified managed dataset identifier.

The method returns a list of each data item. Use the helper function to count the number of elements in the response, which corresponds to the total number of examples in the uploaded dataset.

The helper function will return the total count of examples in the dataset, as well as information on the last example `last_item`.

*Note, that the count is 3667, while the original count was 3668. This is because AutoML dataset service found two of the entries to be in error.*

In [None]:
def list_data_items(dataset):
    """Count the data items of a managed dataset.

    Args:
        dataset: Dict holding the dataset resource name under the 'name' key.

    Returns:
        A tuple `(count, last_item)`: the number of data items and the last
        item yielded (None when there are no items). Returns `(None, None)`
        if an exception was raised (the exception is printed).
    """
    print("dataset:", dataset)
    try:
        items = clients['dataset'].list_data_items(parent=dataset['name'])
        total, last = 0, None
        # Walk the whole response; enumerate leaves the running count and the
        # final item in `total` / `last`.
        for total, last in enumerate(items, start=1):
            pass
        print("count:", total)
        return total, last
    except Exception as err:
        print("exception:", err)
        return None, None


count, last_item = list_data_items(dataset)

Let's now look at the information on the last example in the dataset. There are a few fields here we are interested in:

- `name` : This is the fully qualified identifier to the data item.

- `labels`: The resource label (e.g., training) assigned to the data item when ML_USE is specified.

- `gcsUri`: This is the Cloud Storage location of the data item.

- `mimeType`: This is the data type of the data item. In this tutorial the data items are JPG compressed images.

In [None]:
# Show the last data item returned by list_data_items().
print(last_item)

## Train the model

Let's now train an AutoML image classification model using your AI Platform (Unified) managed dataset. To train the model, do the following steps:

1. Create an AI Platform (Unified) managed training pipeline for the dataset.
2. Execute the pipeline to start the training.

### Create a training pipeline

You may ask, what do we use a pipeline for? You typically use pipelines when the job (such as training) has multiple steps, generally in sequential order: do step A, do step B, etc. By putting the steps into a pipeline, we gain the benefits of:

1. Being reusable for subsequent training jobs.
2. Can be containerized and run as a batch job.
3. Can be distributed.
4. All the steps are associated with the same pipeline job for tracking progress.

Use this helper function `create_pipeline`, which takes the parameters:

- `pipeline_name`: A human readable name for the pipeline job.
- `model_name`: A human readable name for the model.
- `dataset`: The AI Platform (Unified) fully qualified dataset identifier.
- `schema`: The training task definition schema. For this tutorial, it will be the schema for training an image classification model.
- `task`: A dictionary describing the requirements for the training job.

The helper function uses the AI Platform (Unified) pipeline client service, calling the method `create_training_pipeline`, which takes the parameters:

- `parent`: The AI Platform (Unified) location root path for your dataset, model and endpoint resources.
- `training_pipeline`: the full specification for the pipeline training job.

Let's look now deeper into the *minimal* requirements for constructing a `training_pipeline` specification:

- `display_name`: A human readable name for the pipeline job.
- `training_task_definition`: The training task definition schema (here, the AutoML image classification training schema).
- `training_task_inputs`: A dictionary describing the requirements for the training job.
- `input_data_config`: The dataset specification.
 - `dataset_id`: The AI Platform (Unified) dataset identifier only (non-fully qualified) -- this is the last part of the fully-qualified identifier.
 - `fraction_split`: If specified, the percentages of the dataset to use for training, test and validation. Otherwise, the percentages are automatically selected by AutoML.
- `model_to_upload`: A human readable name for the model. 

In [None]:
def create_pipeline(pipeline_name, model_name, dataset, schema, task):
    """Create a training pipeline for a managed dataset.

    Args:
        pipeline_name: Display name for the pipeline job.
        model_name: Display name for the model to upload.
        dataset: Fully qualified dataset resource name; only the part after
            the last '/' is sent as `dataset_id`.
        schema: Training task definition schema URI.
        task: Training task inputs.

    Returns:
        The object returned by `create_training_pipeline`, or None if an
        exception was raised (the exception is printed).
    """
    # Only the trailing ID of the fully qualified name is used.
    input_config = {'dataset_id': dataset.split('/')[-1]}
    # An explicit 80/10/10 split is requested only for SPLIT_TYPE == 'FRACTION'.
    if SPLIT_TYPE == 'FRACTION':
        input_config['fraction_split'] = {
            'training_fraction': 0.8,
            'validation_fraction': 0.1,
            'test_fraction': 0.1,
        }

    training_pipeline = dict(
        display_name=pipeline_name,
        training_task_definition=schema,
        training_task_inputs=task,
        input_data_config=input_config,
        model_to_upload={"display_name": model_name},
    )

    try:
        pipeline = clients['pipeline'].create_training_pipeline(parent=PARENT, training_pipeline=training_pipeline)
        print(pipeline)
        return pipeline
    except Exception as err:
        print("exception:", err)
        return None

Next, construct the task requirements. Unlike other parameters which take a Python (JSON-like) dictionary, the `task` field takes a Google protobuf Struct, which is very similar to a Python dictionary. Use the `json_format.ParseDict` method for the conversion. The minimal fields we need to specify are:

- `multi_label`: Whether True/False this is a multi-label (vs single) classification.
- `budget_milli_node_hours`: The maximum time to budget (billed) for training the model, where 1000 = 1 hour. For image classification, the budget must be a minimum of 8 hours.
- `model_type`: The type of deployed model, ex. CLOUD for deploying to Google Cloud.
- `disable_early_stopping`: Whether True/False to let AutoML use its judgement to stop training early or train for the entire budget.

Finally, you create the pipeline by calling the helper function `create_pipeline`, which returns an instance of a training pipeline object.


In [None]:
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

SCHEMA = TRAINING_IMAGE_CLASSIFICATION_SCHEMA
PIPE_NAME = "-".join(["flowers_pipe", TIMESTAMP])
MODEL_NAME = "-".join(["flowers_model", TIMESTAMP])

# Training task inputs, converted from a Python dict into a protobuf Value.
task = json_format.ParseDict(
    dict(
        multi_label=False,
        budget_milli_node_hours=8000,
        model_type="CLOUD",
        disable_early_stopping=False,
    ),
    Value(),
)

# Submit the training pipeline for the managed dataset created earlier.
pipeline = create_pipeline(PIPE_NAME, MODEL_NAME, dataset['name'], SCHEMA, task)

### List all training pipelines

Your training pipeline is now executing on Google Cloud AI Platform. Let's start by getting a list of all your pipelines and corresponding execution state. You likely only have one, but if you have been experimenting with this tutorial or otherwise have used AI Platform (Unified) pipelines previously, you will see those as well.

Use this helper function `list_training_pipeline`. This function uses the pipeline client service and calls the method `list_training_pipelines`, with the parameter:

- `parent`: The AI Platform (Unified) location root path for your dataset, model and endpoint resources.

The method returns a `response object` as a list, where every element in the list is a pipeline object instance. The field we are most interested in is `response.state`, which should be at this early point: `PIPELINE_STATE_RUNNING` -- which means the model is being trained, but not completed.

You could also see `PIPELINE_STATE_PENDING`, which indicates either that the service has not yet finished provisioning the resources for the training job, or that the training job has momentarily been paused.

In [None]:
def list_training_pipeline():
    """Print the main fields of every training pipeline under PARENT.

    Returns:
        None; the information is only printed.
    """
    all_pipelines = clients['pipeline'].list_training_pipelines(parent=PARENT)
    for entry in all_pipelines:
        print("pipeline")
        print(" name:", entry.name)
        print(" display_name:", entry.display_name)
        print(" training_task_definition:", entry.training_task_definition)
        print(" training_task_inputs:", dict(entry.training_task_inputs))
        print(" state:", entry.state)
        print(" create_time:", entry.create_time)
        print(" start_time:", entry.start_time)
        print(" end_time:", entry.end_time)
        print(" update_time:", entry.update_time)
        print(" labels:", dict(entry.labels))


list_training_pipeline()

### Get information on a training pipeline

Let's now get pipeline information for just this training pipeline instance. Use the pipeline client service and invoke the `get_training_pipeline` method, with the parameter:

- `name`: The AI Platform (Unified) fully qualified pipeline identifier.

When the model is done training, the pipeline state will be `PIPELINE_STATE_SUCCEEDED`.

In [None]:
def get_training_pipeline(name, silent=False):
    """Fetch a single training pipeline and optionally print its details.

    Args:
        name: Fully qualified pipeline identifier passed to the pipeline
            client's `get_training_pipeline` method.
        silent: When true, skip all printing and just return the response.

    Returns:
        The response object returned by `get_training_pipeline`.
    """
    fetched = clients['pipeline'].get_training_pipeline(name=name)

    if not silent:
        # (label, value) pairs in the order they are displayed.
        rows = (
            (" name:", fetched.name),
            (" display_name:", fetched.display_name),
            (" state:", fetched.state),
            (" training_task_definition:", fetched.training_task_definition),
            (" training_task_inputs:", dict(fetched.training_task_inputs)),
            (" create_time:", fetched.create_time),
            (" start_time:", fetched.start_time),
            (" end_time:", fetched.end_time),
            (" update_time:", fetched.update_time),
            (" labels:", dict(fetched.labels)),
        )
        print("pipeline")
        for label, value in rows:
            print(label, value)

    return fetched


pipeline_response = get_training_pipeline(pipeline.name)

# Deployment

## Pre-Cooked

Training the above model may take upwards of ~20 minutes. For expediency, we have a pre-cooked (already trained) version of this model you can use for the next steps, while you wait for your model to finish training.

Once your model is done training, you can repeat these steps for your trained model. You can calculate the actual time it took to train the model by subtracting `start_time` from `end_time`. For your model, you will need to know the fully qualified AI Platform (Unified) managed model identifier, which the pipeline service assigned to it. You can get this from the returned pipeline instance as the field `model_to_upload.name`.

You can choose between the precooked model or your trained model with the Python variable `precook` in the cell below.

In [None]:
import time

# Image Classification
PRECOOK_IMAGE_CLASSIFICATION_MODEL = '[not-supported-yet]'
PRECOOK_MODEL = PRECOOK_IMAGE_CLASSIFICATION_MODEL

# Precooked flag: True uses the pre-trained model, False waits for your own
# training pipeline to finish.
precook = False

if precook:
    model_to_deploy_name = PRECOOK_MODEL
else:
    # Terminal states from which the pipeline will never reach SUCCEEDED.
    # Polling must stop on any of them; checking only FAILED would loop
    # forever if the pipeline was cancelled.
    unsuccessful_end_states = (
        aip.PipelineState.PIPELINE_STATE_FAILED,
        aip.PipelineState.PIPELINE_STATE_CANCELLED,
    )

    # Stays None unless the pipeline succeeds.
    model_to_deploy_name = None
    while True:
        pipeline_response = get_training_pipeline(pipeline.name, True)
        if pipeline_response.state == aip.PipelineState.PIPELINE_STATE_SUCCEEDED:
            model_to_deploy = pipeline_response.model_to_upload
            model_to_deploy_name = model_to_deploy.name
            break

        print("Training job has not completed:", pipeline_response.state)
        if pipeline_response.state in unsuccessful_end_states:
            break

        # Poll once a minute while training is still in progress.
        time.sleep(60)

print("model to deploy:", model_to_deploy_name)

## Model information

### List all models

Let's first get a list of all your AI Platform (Unified) managed models. Use this helper function `list_models`. This helper function uses the AI Platform (Unified) model client service, and calls the method `list_models`, with the parameter:

- `parent`: The AI Platform (Unified) location root path for your dataset, model and endpoint resources.

The response object from the call is a list, where each element is a AI Platform (Unified) managed model. For each model, you will display a few fields:

- `name`: The AI Platform (Unified) unique identifier for the managed model.
- `display_name`: The human readable name assigned to the model.
- `create_time`: Timestamp when the model resource was created.
- `update_time`: Timestamp when the model resource was last updated.
- `container`: The container image used for serving predictions from the model (`container_spec.image_uri`).
- `artifact_uri`: The Cloud Storage location of the model artifact.

In [None]:
def list_models():
    """Print selected fields of every managed model under `PARENT`.

    Calls `list_models` on the model client; each model's fields are followed
    by a blank separator.
    """
    for entry in clients['model'].list_models(parent=PARENT):
        # (label, value) pairs in the order they are displayed.
        rows = (
            ("name", entry.name),
            ("display_name", entry.display_name),
            ("create_time", entry.create_time),
            ("update_time", entry.update_time),
            ("container", entry.container_spec.image_uri),
            ("artifact_uri", entry.artifact_uri),
        )
        for label, value in rows:
            print(label, value)
        print('\n')


list_models()

### Get model information

Now let's get the model information for just your model. Use this helper function `get_model`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed model.

This helper function uses the AI Platform (Unified) model client service, and calls the method `get_model`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed model.

In [None]:
def get_model(name):
    """Fetch the model identified by `name` via the model client and print it.

    Args:
        name: Identifier passed to the model client's `get_model` method.
    """
    print(clients['model'].get_model(name=name))


get_model(model_to_deploy_name)

## Evaluate the model

Now let's find out how good the model service believes your model is. As part of training, some portion of the dataset was set aside as the test (holdout) data, which is used by the pipeline service to evaluate the model.

### List evaluations for all slices

Use this helper function `list_model_evaluations`, which takes the parameter:

- `name`: The AI Platform (Unified) fully qualified model identifier.

This helper function uses the AI Platform (Unified) model client service, and calls the method `list_model_evaluations`, which takes the same parameter. The response object from the call is a list, where each element is an evaluation metric.

For each evaluation -- you probably only have one, we then print all the key names for each metric in the evaluation, and for a small set (`logLoss` and `auPrc`) we print the result.

In [None]:
def list_model_evaluations(name):
    """Print every evaluation of a model and return the last evaluation's name.

    For each evaluation returned by the model client's
    `list_model_evaluations` method, prints its name, metrics schema URI, all
    metric keys, and the `logLoss` and `auPrc` values.

    Args:
        name: Fully qualified model identifier; passed as `parent` to
            `list_model_evaluations`.

    Returns:
        The `name` of the last evaluation listed, or None when the model has
        no evaluations.
    """
    # Tracked explicitly so an empty response returns None instead of raising
    # UnboundLocalError on the loop variable.
    last_evaluation_name = None

    response = clients['model'].list_model_evaluations(parent=name)
    for evaluation in response:
        last_evaluation_name = evaluation.name
        print("model_evaluation")
        print(" name:", evaluation.name)
        print(" metrics_schema_uri:", evaluation.metrics_schema_uri)
        metrics = json_format.MessageToDict(evaluation._pb.metrics)
        for metric in metrics.keys():
            print(metric)
        # .get() so a missing metric prints None rather than raising KeyError.
        print('logloss', metrics.get('logLoss'))
        print('auPrc', metrics.get('auPrc'))

    if last_evaluation_name is None:
        print("No evaluations found for model:", name)

    return last_evaluation_name


last_evaluation = list_model_evaluations(model_to_deploy_name)

### Get evaluations for a slice

Now, let's use the AI Platform (Unified) fully qualified identifier for an evaluation to get just that specific evaluation. Use the last evaluation (`last_evaluation`) from our previous list of evaluations as an example.

Use this helper function `model_evaluation`, which takes as a parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for the specific model evaluation.

The helper function uses the model client service and calls the method `get_model_evaluation`, with the parameter:

- `name`: The AI Platform (Unified) fully qualified identifier for the specific model evaluation.

Next, print the entire evaluation data -- which may seem at first somewhat verbose.

In [None]:
def model_evaluation(name):
    """Fetch one model evaluation and print its fields and mean attributions.

    Args:
        name: Fully qualified identifier of the model evaluation; passed to
            the model client's `get_model_evaluation` method.
    """
    evaluation = clients['model'].get_model_evaluation(name=name)

    print("response")
    print(" name:", evaluation.name)
    print(" metrics_schema_uri:", evaluation.metrics_schema_uri)
    print(" metrics:", json_format.MessageToDict(evaluation._pb.metrics))
    print(" create_time:", evaluation.create_time)
    print(" slice_dimensions:", evaluation.slice_dimensions)

    explanation = evaluation.model_explanation
    print(" model_explanation")
    for attribution in explanation.mean_attributions:
        print("  mean_attribution")
        print("   baseline_output_value:", attribution.baseline_output_value)
        print("   instance_output_value:", attribution.instance_output_value)
        # Convert the underlying protobuf value to a plain dict for display.
        attributions_dict = json_format.MessageToDict(attribution._pb.feature_attributions)
        print("   feature_attributions:", attributions_dict)
        print("   output_index:", attribution.output_index)
        print("   output_display_name:", attribution.output_display_name)
        print("   approximation_error:", attribution.approximation_error)


model_evaluation(last_evaluation)

## Deploy the model

Let's now deploy the trained AI Platform (Unified) model you created with AutoML. This requires two steps:

1. Create an endpoint for deploying the model to.

2. Deploy the model to the endpoint.

### Create an endpoint

Use this helper function `create_endpoint` to create an endpoint to deploy the model to for serving predictions, with the parameter:

- `display_name`: A human readable name for the endpoint.

The helper function uses the endpoint client service and calls the method `create_endpoint`, which takes the parameter:

- `display_name`: A human readable name for the endpoint.

Creating an endpoint returns a long running operation, since it may take a few moments to provision the endpoint for serving. You call `response.result()`, which is a synchronous call and will return when the endpoint is ready. The helper function returns the AI Platform (Unified) fully qualified identifier for the endpoint -- `response.name`.


In [None]:
ENDPOINT_NAME = "flowers_endpoint-" + TIMESTAMP


def create_endpoint(display_name):
    """Create an endpoint under `PARENT` and return its resource name.

    Args:
        display_name: Human readable name for the endpoint.

    Returns:
        The `name` field of the created endpoint.
    """
    operation = clients['endpoint'].create_endpoint(
        parent=PARENT, endpoint={"display_name": display_name}
    )
    print("Long running operation:", operation.operation.name)

    # Wait (up to 300 seconds) for the long running operation to complete.
    created = operation.result(timeout=300)
    print("result")
    rows = (
        (" name:", created.name),
        (" display_name:", created.display_name),
        (" description:", created.description),
        (" labels:", created.labels),
        (" create_time:", created.create_time),
        (" update_time:", created.update_time),
    )
    for label, value in rows:
        print(label, value)
    return created.name


endpoint_name = create_endpoint(ENDPOINT_NAME)

### Compute instance scaling

You have several choices on scaling the compute instances for handling your online prediction requests:

- Single Instance: The online prediction requests are processed on a single compute instance.
  - Set the minimum (`MIN_NODES`) and maximum (`MAX_NODES`) number of compute instances to one. 


- Manual Scaling: The online prediction requests are split across a fixed number of compute instances that you manually specified.
  - Set the minimum (`MIN_NODES`) and maximum (`MAX_NODES`) number of compute instances to the same number of nodes. When a model is first deployed to the instance, the fixed number of compute instances are provisioned and online prediction requests are evenly distributed across them.
  
The minimum number of compute instances corresponds to the field `min_replica_count` and the maximum number of compute instances corresponds to the field `max_replica_count`, in your subsequent deployment request.

In [None]:
# Minimum number of compute instances; used as `min_replica_count` in the deployment request.
MIN_NODES = 1
# Maximum number of compute instances; used as `max_replica_count` in the deployment request.
MAX_NODES = 1

### Deploy model to the endpoint

Use this helper function `deploy_model` to deploy the model to the endpoint you created for serving predictions, with the parameters:

- `model`: The AI Platform (Unified) fully qualified model identifier of the model to upload (deploy) from the training pipeline.
- `deployed_model_display_name`: A human readable name for the deployed model.
- `endpoint`: The AI Platform (Unified) fully qualified endpoint identifier to deploy the model to.

The helper function uses the endpoint client service and calls the method `deploy_model`, which takes the parameters:

- `endpoint`: The AI Platform (Unified) fully qualified endpoint identifier to deploy the model to.
- `deployed_model`: The requirements for deploying the model.
- `traffic_split`: Percent of traffic at endpoint that goes to this model, which is specified as a dictionary of one or more key/value pairs.
   - If only one model, then specify as **{ "0": 100 }**, where "0" refers to this model being uploaded and 100 means 100% of the traffic.
   - If there are existing models on the endpoint across which the traffic will be split, then specify the dictionary as follows, where `model_id` is the ID of a model already deployed to the endpoint. The percentages must add up to 100.
   
           { "0": percent, model_id: percent, ... }

Let's now dive deeper into the `deployed_model` parameter. This parameter is specified as a Python dictionary with the minimum required fields:

- `model`: The AI Platform (Unified) fully qualified model identifier of the (upload) model to deploy.
- `display_name`: A human readable name for the deployed model.
- `automatic_resources`: This refers to how many compute instances (replicas) that are scaled for serving prediction requests. 
  - `min_replica_count`: The number of compute instances to initially provision, which you set earlier as the variable `MIN_NODES`.
  - `max_replica_count`: The maximum number of compute instances to scale to, which you set earlier as the variable `MAX_NODES`.
- `enable_container_logging`: This enables logging of container events, such as execution failures (default is False). This is typically set to True when debugging the deployment and then set to False when deployed for production.

Let's now dive deeper into the `traffic_split` parameter. This parameter is specified as a Python dictionary. This might at first be a tad bit confusing. Let me explain, you can deploy more than one instance of your model to an endpoint, and then set how much (percent) goes to each instance. 

Why would you do that? Perhaps you already have a previous version deployed in production -- let's call that v1. You got better model evaluation on v2, but you don't know for certain that it is really better until you deploy to production. So in the case of traffic split, you might want to deploy v2 to the same endpoint as v1, but have it get only, say, 10% of the traffic. That way, you can monitor how well it does without disrupting the majority of users -- until you make a final decision.

The method returns a long running operation `response`. You will wait synchronously for the operation to complete by calling `response.result()`, which will block until the model is deployed. If this is the first time a model is deployed to the endpoint, it may take a few additional minutes to complete provisioning of resources.

In [None]:
DEPLOYED_NAME = "flowers_deployed-" + TIMESTAMP


def deploy_model(model, deployed_model_display_name, endpoint, traffic_split=None):
    """Deploy a model to an endpoint and return the deployed model's id.

    Args:
        model: Fully qualified identifier of the model to deploy.
        deployed_model_display_name: Human readable name for the deployed model.
        endpoint: Fully qualified identifier of the endpoint to deploy to.
        traffic_split: Dictionary mapping deployed model ids to traffic
            percentages. Defaults to `{"0": 100}`, i.e. all traffic goes to
            the model being deployed by this call.

    Returns:
        The `id` of the deployed model reported by the completed operation.
    """
    if traffic_split is None:
        # Built per call: a dict literal in the signature would be one shared
        # object reused (and potentially mutated) across every call.
        traffic_split = {"0": 100}

    deployed_model = {
        "model": model,
        "display_name": deployed_model_display_name,
        "automatic_resources": {
            "min_replica_count": MIN_NODES,
            "max_replica_count": MAX_NODES
        },
        "enable_container_logging": False
    }

    response = clients['endpoint'].deploy_model(
        endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split)

    print("Long running operation:", response.operation.name)
    # Block until the deployment operation has finished.
    result = response.result()
    print("result")
    deployed_model = result.deployed_model
    print(" deployed_model")
    print("  id:", deployed_model.id)
    print("  model:", deployed_model.model)
    print("  display_name:", deployed_model.display_name)
    print("  create_time:", deployed_model.create_time)

    return deployed_model.id


deployed_model_id = deploy_model(model_to_deploy_name, DEPLOYED_NAME, endpoint_name)

### List all endpoints

Let's now get a list of all your endpoints. Use this helper function `list_endpoints`. 

The helper function uses the endpoint client service and calls the method `list_endpoints`. The returned response object is a list, with an element for each endpoint. The helper function lists a few example fields for each endpoint:

- `name`: The AI Platform (Unified) identifier for the managed endpoint.
- `display_name`: The human readable name you assigned to the endpoint.
- `create_time`: When the endpoint was created.
- `deployed_models`: The models and associated information that are deployed to this endpoint.

In [None]:
def list_endpoints():
    """Print selected fields of every endpoint under `PARENT`.

    Calls `list_endpoints` on the endpoint client; each endpoint's fields are
    followed by a blank separator.
    """
    for entry in clients['endpoint'].list_endpoints(parent=PARENT):
        # (label, value) pairs in the order they are displayed.
        rows = (
            ("name:", entry.name),
            ("display name:", entry.display_name),
            ("create_time:", entry.create_time),
            ("deployed_models", entry.deployed_models),
        )
        for label, value in rows:
            print(label, value)
        print("\n")
        
list_endpoints()

### Get information on this endpoint

Now let's get the endpoint information for just your endpoint. Use this helper function `get_endpoint`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed endpoint.

This helper function uses the AI Platform (Unified) endpoint client service, and calls the method `get_endpoint`, with the parameter:

- `name`: The AI Platform (Unified) unique identifier for the managed endpoint.

In [None]:
def get_endpoint(name):
    """Fetch the endpoint identified by `name` via the endpoint client and print it.

    Args:
        name: Identifier passed to the endpoint client's `get_endpoint` method.
    """
    print(clients['endpoint'].get_endpoint(name=name))
    
get_endpoint(endpoint_name)

## Make a prediction request

Let's now do a prediction to your deployed model. You will use an arbitrary image out of the dataset as a test image. Don't be concerned that the image was likely used in training the model -- we just want to demonstrate how to make a prediction.

In [None]:
if IMPORT_FORMAT == 'CSV':
    test_item = !gsutil cat $IMPORT_FILE | head -n1
    if SPLIT_TYPE == 'ML_USE':
        _, test_item, test_label = str(test_item[0]).split(',')
    else:
        test_item, test_label = str(test_item[0]).split(',')
elif IMPORT_FORMAT == 'JSONL':
    import json
    test_items = !gsutil cat $IMPORT_FILE | head -n1
    test_data = test_items[0].replace('\'', '"')
    test_data = json.loads(test_data)
    try:
        test_item = test_data['image_gcs_uri']
        test_label = test_data['classification_annotation']['display_name']
    except:
        test_item = test_data['imageGcsUri']
        test_label = test_data['classificationAnnotation']['displayName']

print(test_item, test_label)

Ok, now you have a test item. Use this helper function `predict_item`, which takes the parameters:

- `filename`: The Cloud Storage path to the test item.
- `endpoint`: The AI Platform (Unified) fully qualified identifier for the endpoint where the model was deployed.
- `parameters_dict`: Additional filtering parameters for serving prediction results.

This function uses the prediction client service and calls the `predict` method with the parameters:

- `endpoint`: The AI Platform (Unified) fully qualified identifier for the endpoint where the model was deployed.
- `instances`: A list of instances (encoded images) to predict.
- `parameters`: Additional filtering parameters for serving prediction results.
  - `confidenceThreshold`: The threshold for returning predictions. Must be between 0 and 1.
  - `maxPredictions`: The maximum number of predictions to return, sorted by confidence.
  
You might ask, how does confidence_threshold affect the model accuracy? The threshold won't change the accuracy. What it changes is recall and precision.

    - Precision: The higher the precision the more likely what is predicted is the correct prediction, but return fewer predictions. Increasing the confidence threshold increases precision.
    - Recall: The higher the recall the more likely a correct prediction is returned in the result, but return more prediction with incorrect prediction.  Decreasing the confidence threshold increases recall.
    
In this example, you will predict for precision. You set the confidence threshold to 0.5 and the maximum number of predictions for a classification to two. Since all the confidence values across the classes must add up to one, there are only three possible outcomes:

    1. There is a tie, both 0.5, and returns two predictions.
    2. One value is above 0.5 and the rest are below 0.5, and returns one prediction.
    3. No value reaches 0.5 (for example 0.4, 0.3 and 0.3), and returns no predictions.

Since in this example your test item is in a Cloud Storage bucket, you open and read the contents of the image using `tf.io.gfile.GFile()`. To pass the test data to the prediction service, you encode the bytes into base64 -- which makes the content safe from modification while transmitting binary data over the network.

Since the `predict()` method can take multiple items (instances), send your single test item as a list of one test item. As a final step, you package the instances list into Google's protobuf format -- which is what you pass to the `predict()` method.

The `response` object returns a list, where each element in the list corresponds to the corresponding image in the request. You will see in the output for each prediction -- in our case there is just one:

- `ids`: The instance ID of each data item.
- `confidences`: The percent of confidence between 0 and 1 in the prediction for each class.
- `displayNames`: The corresponding class names.

In [None]:
import tensorflow as tf
import base64


def predict_item(filename, endpoint, parameters_dict):
    """Send one image to a deployed model for prediction and print the result.

    Args:
        filename: Path of the image to read with `tf.io.gfile.GFile`.
        endpoint: Identifier of the endpoint, passed to the prediction
            client's `predict` method.
        parameters_dict: Dictionary of prediction parameters, converted to a
            protobuf `Value` before being sent.

    Returns:
        The response object returned by `predict`.
    """
    parameters = json_format.ParseDict(parameters_dict, Value())

    with tf.io.gfile.GFile(filename, "rb") as image_file:
        raw_bytes = image_file.read()

    # The format of each instance should conform to the deployed model's prediction input schema.
    encoded_image = base64.b64encode(raw_bytes).decode("utf-8")
    instances = [json_format.ParseDict({"content": encoded_image}, Value())]

    response = clients['prediction'].predict(
        endpoint=endpoint, instances=instances, parameters=parameters
    )

    print("response")
    print(" deployed_model_id:", response.deployed_model_id)
    print("predictions")
    for item in response.predictions:
        # See gs://google-cloud-aiplatform/schema/predict/prediction/classification.yaml for the format of the predictions.
        print(" prediction:", dict(item))
    return response


response = predict_item(test_item, endpoint_name, {'confidenceThreshold' : 0.5, 'maxPredictions': 2})

#### Parsing the prediction response

The helper function `parse_prediction` is an example of a function to assist in parsing the prediction response. It takes the response object from the `predict()` call and returns a list, with one element per instance in the prediction request. Each element is a two-item list:

- The class label
- The confidence level

In [None]:
import numpy as np
def parse_prediction(response):
    """Extract the highest-confidence class from each prediction in a response.

    Args:
        response: Object with a `predictions` attribute; each prediction is
            indexed by the keys 'confidences' and 'displayNames'.

    Returns:
        A list with one `[class_label, confidence]` entry per prediction.
        A prediction with no confidences (for example, when no class reached
        the confidence threshold) yields `[None, None]`.
    """
    retval = []
    for prediction in response.predictions:
        confidences = prediction['confidences']
        class_labels = prediction['displayNames']

        # np.argmax raises ValueError on an empty sequence, so an instance
        # with no returned classes is reported explicitly.
        if len(confidences) == 0:
            retval.append([None, None])
            continue

        best = np.argmax(confidences)
        retval.append([class_labels[best], confidences[best]])
    return retval
        
        
predictions = parse_prediction(response)
print(predictions)

## Undeploy the model

Let's now undeploy your model from the serving endpoint. Use this helper function `undeploy_model`, which takes the parameters:

- `deployed_model_id`: The model deployment identifier returned by the endpoint service when the model was deployed.
- `endpoint`: The AI Platform (Unified) fully qualified identifier for the endpoint where the model is deployed.

This function uses the endpoint client service and calls the method `undeploy_model`, with the parameters:

- `deployed_model_id`: The model deployment identifier returned by the endpoint service when the model was deployed.
- `endpoint`: The AI Platform (Unified) fully qualified identifier for the endpoint where the model is deployed.
- `traffic_split`: How to split traffic among the remaining deployed models on the endpoint.

Since this is the only deployed model on the endpoint, you simply can leave `traffic_split` empty by setting it to {}.

In [None]:
def undeploy_model(deployed_model_id, endpoint):
    """Undeploy a model from an endpoint and wait for the operation to finish.

    Args:
        deployed_model_id: The model deployment identifier returned when the
            model was deployed.
        endpoint: Fully qualified identifier of the endpoint the model is
            deployed to.
    """
    response = clients['endpoint'].undeploy_model(
        endpoint=endpoint, deployed_model_id=deployed_model_id, traffic_split={}
    )
    print("Long running operation:", response.operation.name)

    # Undeploying is a long running operation, like create_endpoint and
    # deploy_model. Block until it completes so later steps (such as deleting
    # the endpoint) do not run while the model is still deployed.
    result = response.result()
    print(result)


undeploy_model(deployed_model_id, endpoint_name)

# Cleaning up

To clean up all GCP resources used in this project, you can [delete the GCP
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Model
- Endpoint
- Cloud Storage Bucket

In [None]:
delete_dataset = True
delete_model = True
delete_endpoint = True
delete_bucket = True

# Delete the dataset using the AI Platform (Unified) fully qualified identifier for the dataset
try:
    if delete_dataset:
        clients['dataset'].delete_dataset(name=dataset['name'])
except Exception as e:
    print(e)

# Delete the model using the AI Platform (Unified) fully qualified identifier for the model
try:
    if delete_model:
        clients['model'].delete_model(name=model_to_deploy_name)
except Exception as e:
    print(e)

# Delete the endpoint using the AI Platform (Unified) fully qualified identifier for the endpoint
try:
    if delete_endpoint:
        clients['endpoint'].delete_endpoint(name=endpoint_name)
except Exception as e:
    print(e)

if delete_bucket and 'BUCKET_NAME' in globals():
    ! gsutil rm -r gs://$BUCKET_NAME
        
# Collect any unclaimed memory
import gc
gc.collect()