In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ONNX Serving on Vertex AI PSC Private Endpoint

## Overview

Compared to the current PSA Private Endpoint, PSC based Private Endpoint has the following benefits:
1. Simpler setup process: Currently, the only extra step user need to do is to create an Endpoint in their VPC. And this will be done by PSC automatically before our GA launch.

2. No more IP exhuasted issue: GKE cluster will be hosted in tenant project VPC, so we can create much bigger cluster and won't affected by ip exhuasted issue in User's VPC.

3. Unified experience with public endpoint: The API is the same as public endpoint, so user can use our SDK/client library. We also provide quota, IAM and monitoring metrics as public endpoint does.


### Install Vertex AI SDK for Python and other required packages

In [None]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

In [None]:
! pip3 install tf2onnx \
               onnxruntime

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
# import sys

# if "google.colab" in sys.modules:

#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
# import sys

# if "google.colab" in sys.modules:

#     from google.colab import auth

#     auth.authenticate_user()

In [3]:
import tf2onnx
import onnx

2025-04-01 19:58:46.490510: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

- `PROJECT_ID`: Google Cloud project ID where Vertex AI resources are deployed
- `LOCATION`: Google Cloud region where the Vertex AI endpoint is located
- `BUCKET_URI`: Google Cloud Storage bucket URI to store model artifacts and other data

In [4]:
PROJECT_ID = "sandbox-401718"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [5]:
# Create GCS Bucket
BUCKET_URI = f"gs://{PROJECT_ID}-pred-benchmark"  # @param {type:"string"}
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://sandbox-401718-pred-benchmark/...
ServiceException: 409 A Cloud Storage bucket named 'sandbox-401718-pred-benchmark' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [6]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

## Prepare Test Models

We prepared some test models, feel free to use your own models.

### MNIST Dataset

In [7]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.13.1


In [8]:
mnist = tf.keras.datasets.mnist

# load daatset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

### TF Train Example

In [9]:
# Build Model

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10),
    ]
)

input_signature = [tf.TensorSpec(shape=(None, 28, 28), dtype=tf.float32, name="x")]
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test, verbose=2)
model.save("saved_model")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 1s - loss: 0.0762 - accuracy: 0.9775 - 576ms/epoch - 2ms/step
INFO:tensorflow:Assets written to: saved_model/assets


INFO:tensorflow:Assets written to: saved_model/assets


In [10]:
# # Load your own model
# model = tf.keras.models.load_model("./saved_model")

In [11]:
# Tensorflow Model Predict
predictions = model(x_train[:1]).numpy()
predictions

array([[ -7.792286 ,  -2.4589105,  -6.757009 ,   7.791324 , -25.718307 ,
         13.021902 , -12.477514 ,  -9.414201 , -12.708923 ,  -3.4792993]],
      dtype=float32)

### Inference ONNX model

In [19]:
# # # Load your own model
# onnx_model = onnx.load("model.onnx")

In [10]:
import tf2onnx
import onnx

onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature, opset=13)
onnx.save(onnx_model, "./model.onnx")


2025-04-01 20:05:26.975973: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2025-04-01 20:05:26.976140: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2025-04-01 20:05:27.009929: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2025-04-01 20:05:27.010128: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


In [12]:
from onnxruntime import InferenceSession
import numpy as np

# Path to your ONNX model
onnx_model_path = "model.onnx"

input_data = x_train[:1].astype(np.float32)

# Create an inference session
session = InferenceSession(onnx_model_path)

# Get the input name
input_name = session.get_inputs()[0].name

# Run inference.
outputs = session.run(None, {input_name: input_data})
outputs

[array([[ -7.792286 ,  -2.4589112,  -6.7570105,   7.7913237, -25.718304 ,
          13.021903 , -12.477514 ,  -9.414201 , -12.708925 ,  -3.4793   ]],
       dtype=float32)]

## Build custom Serving container

- `ARTIFACT_REPO`: (Prerequisite) Name of the Artifact Registry repository to store the custom serving container image
- `JOB_IMAGE_ID`: Name of the Docker image for the custom serving container
- `VERSION`: Version or tag of the Docker image. Default set as latest.


In [15]:
ARTIFACT_REPO  = "workbench" # @param {type:"string"} ######################
JOB_IMAGE_ID = "vertex-custom-serve" # @param {type:"string"}
VERSION = "latest"

In [13]:
%%writefile app.py

import os
from typing import List
import numpy as np
from onnxruntime import InferenceSession
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import asyncio

# Input
class PredictionInput(BaseModel):
    instances: List[List[List[List[float]]]]

# Variables
MODEL_PATH = "/app/model.onnx"
AIP_HEALTH_ROUTE = os.environ.get("AIP_HEALTH_ROUTE", "/health")
AIP_PREDICT_ROUTE = os.environ.get("AIP_PREDICT_ROUTE", "/predict")

# initiate serving server
app = FastAPI(title="Serving Model")

# load model
@app.on_event("startup")
async def load_inference_session():
    global session
    session = InferenceSession(MODEL_PATH)

# check health
@app.get(AIP_HEALTH_ROUTE, status_code=200)
async def health():
    if session is None:
        return dict(status="unhealthy model not loaded")
    return dict(status="healthz")


# prediction endpoint 
@app.post(AIP_PREDICT_ROUTE)
async def predict(input_data: PredictionInput):

    global session

    if session is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    instances = input_data.instances
    instances = np.array(instances).astype(np.float32)[0]

    # Get the input name
    input_name = session.get_inputs()[0].name

    # Run inference.
    outputs = session.run(None, {input_name: instances})
    print(outputs)

    return dict(predictions=outputs[0].tolist())

Writing app.py


In [14]:
%%writefile Dockerfile

FROM python:3.10-slim

COPY ./requirements.txt /app/requirements.txt
COPY ./model.onnx /app/model.onnx
COPY ./app.py /app/app.py
WORKDIR ./app

RUN apt-get update && apt-get install gcc libffi-dev -y

RUN pip install -r requirements.txt

EXPOSE 8080

CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]

Writing Dockerfile


In [18]:
# # Build and push image to reigstry
! docker build . -f Dockerfile -t {LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO}/{JOB_IMAGE_ID}:{VERSION}
! gcloud auth configure-docker {LOCATION}-docker.pkg.dev --quiet
! docker push {LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO }/{JOB_IMAGE_ID}:{VERSION}

Sending build context to Docker daemon  2.464MB
Step 1/9 : FROM python:3.10-slim
 ---> 797a4d7093b1
Step 2/9 : COPY ./requirements.txt /app/requirements.txt
 ---> Using cache
 ---> abd4edfdb378
Step 3/9 : COPY ./model.onnx /app/model.onnx
 ---> Using cache
 ---> 12b217a893d0
Step 4/9 : COPY ./app.py /app/app.py
 ---> Using cache
 ---> d38bcf2b2c4f
Step 5/9 : WORKDIR ./app
 ---> Using cache
 ---> e9e9642e406f
Step 6/9 : RUN apt-get update && apt-get install gcc libffi-dev -y
 ---> Using cache
 ---> 0616767449d1
Step 7/9 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> f3c04c21a414
Step 8/9 : EXPOSE 8080
 ---> Using cache
 ---> df6d3db63751
Step 9/9 : CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]
 ---> Using cache
 ---> 4bc047f8d2b2
Successfully built 4bc047f8d2b2
Successfully tagged us-west2-docker.pkg.dev/sandbox-401718/workbench/vertex-custom-serve:latest

{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud"

### Upload Model

In [19]:
model = aiplatform.Model.upload(
    display_name=JOB_IMAGE_ID,
    location = LOCATION,
    serving_container_image_uri=f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO }/{JOB_IMAGE_ID}:{VERSION}",
    serving_container_predict_route='/predict',
    serving_container_health_route='/health',
    serving_container_ports=[8080],
)

model.wait()

Creating Model


INFO:google.cloud.aiplatform.models:Creating Model


Create Model backing LRO: projects/757654702990/locations/us-west2/models/2875552760122572800/operations/2943547108940054528


INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/757654702990/locations/us-west2/models/2875552760122572800/operations/2943547108940054528


Model created. Resource name: projects/757654702990/locations/us-west2/models/2875552760122572800@1


INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/757654702990/locations/us-west2/models/2875552760122572800@1


To use this Model in another session:


INFO:google.cloud.aiplatform.models:To use this Model in another session:


model = aiplatform.Model('projects/757654702990/locations/us-west2/models/2875552760122572800@1')


INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/757654702990/locations/us-west2/models/2875552760122572800@1')


### Create PSC based Prediction Private Endpoint


In [20]:
psc_endpoint = aiplatform.PrivateEndpoint.create(
    display_name="psc-endpoint",
    project=PROJECT_ID,
    location=LOCATION,
    private_service_connect_config=aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
        project_allowlist=[PROJECT_ID],
    ),
)

Creating PrivateEndpoint


INFO:google.cloud.aiplatform.models:Creating PrivateEndpoint


Create PrivateEndpoint backing LRO: projects/757654702990/locations/us-west2/endpoints/6086804012391202816/operations/48014023517536256


INFO:google.cloud.aiplatform.models:Create PrivateEndpoint backing LRO: projects/757654702990/locations/us-west2/endpoints/6086804012391202816/operations/48014023517536256


PrivateEndpoint created. Resource name: projects/757654702990/locations/us-west2/endpoints/6086804012391202816


INFO:google.cloud.aiplatform.models:PrivateEndpoint created. Resource name: projects/757654702990/locations/us-west2/endpoints/6086804012391202816


To use this PrivateEndpoint in another session:


INFO:google.cloud.aiplatform.models:To use this PrivateEndpoint in another session:


endpoint = aiplatform.PrivateEndpoint('projects/757654702990/locations/us-west2/endpoints/6086804012391202816')


INFO:google.cloud.aiplatform.models:endpoint = aiplatform.PrivateEndpoint('projects/757654702990/locations/us-west2/endpoints/6086804012391202816')


### Deploy Model

In [21]:
# # # load existing model
# model = aiplatform.Model("projects/757654702990/locations/us-central1/models/323587371566104576")

In [None]:
psc_endpoint.deploy(model=model, traffic_percentage=100, machine_type="e2-standard-16")

psc_endpoint.list_models()


### Create Forwarding Rule in Consumer Project

- `NETWORK`: (Prerequisite) VPC network to use for the forwarding rule.
- `subnet`: (Prerequisite) Subnet within the VPC network

#### Best Practices
Service attachment is a network resource that are used by multiple prediction endpoints. It is recommended to have a 1-1 mapping between the service attachment and forwarding rules/ip address. And this forwarding rule/ip address can be used to access all endpoints using the corresponding service attachment. 
Please note service attachment will only be preserved when there is active deployed model. If all models are undeployed from the endpoint for a while, the service attachment will be recycled and a new one will be created when there is a new model deployed. This means that the service attachment can change for the same endpoint if no active models are deployed. Then the forwarding rule should be deleted and recreated to with the new service attachment.

In [23]:
# Load existing endpoint 
# psc_endpoint= aiplatform.PrivateEndpoint(endpoint_name=f"projects/757654702990/locations/us-central1/endpoints/841510124806733824")

In [24]:
service_attachment = psc_endpoint.list_models()[0].private_endpoints.service_attachment
print(service_attachment)

projects/bde788be156874de5-tp/regions/us-west2/serviceAttachments/gkedpm-9b4fced59d3308dc0028e028117390


In [25]:
endpoint_id = psc_endpoint.resource_name.rsplit('/', 1)[-1]
print(endpoint_id)

6086804012391202816


In [26]:
! gcloud ai endpoints describe {endpoint_id} \
--project={PROJECT_ID} \
--region={LOCATION} \
| grep -i serviceAttachment

Using endpoint [https://us-west2-aiplatform.googleapis.com/]
    serviceAttachment: projects/bde788be156874de5-tp/regions/us-west2/serviceAttachments/gkedpm-9b4fced59d3308dc0028e028117390


Then, create an address and a forwarding rule targeting at the service attachment. In this example, default network and subnet are used, replace it with your VPC network and subnet if running in your VPC.

In [28]:
NETWORK = 'beusebio-network' # @param {type:"string"}
subnet = "projects/sandbox-401718/regions/us-central1/subnetworks/beusebio-network" # @param {type:"string"}

In [30]:
! gcloud compute addresses create psc-prediction \
    --region={LOCATION} \
    --subnet={subnet}

Created [https://www.googleapis.com/compute/v1/projects/sandbox-401718/regions/us-west2/addresses/psc-prediction].


In [32]:
! gcloud compute forwarding-rules create op-psc-endpoint \
    --network={NETWORK} \
    --address=psc-prediction \
    --target-service-attachment={service_attachment} \
    --region={LOCATION}

Created [https://www.googleapis.com/compute/v1/projects/sandbox-401718/regions/us-west2/forwardingRules/op-psc-endpoint].


Save the IP address above.

In [None]:
IP_ADDRESS = ! gcloud compute forwarding-rules describe op-psc-endpoint --region={LOCATION} --format='value(IPAddress)'
IP_ADDRESS = IP_ADDRESS[0]
print(IP_ADDRESS)

### Make Predictions

Note: The endpoint is scoped to the region specified by the user, unless global accessibility is enabled.

In [34]:
data = {"instances": [[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0117647061124444, 0.07058823853731155, 0.07058823853731155, 0.07058823853731155, 0.4941176474094391, 0.5333333611488342, 0.686274528503418, 0.10196078568696976, 0.6509804129600525, 1.0, 0.9686274528503418, 0.49803921580314636, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11764705926179886, 0.1411764770746231, 0.3686274588108063, 0.6039215922355652, 0.6666666865348816, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.8823529481887817, 0.6745098233222961, 0.9921568632125854, 0.9490196108818054, 0.7647058963775635, 0.250980406999588, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1921568661928177, 0.9333333373069763, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9843137264251709, 0.364705890417099, 0.32156863808631897, 0.32156863808631897, 0.21960784494876862, 0.15294118225574493, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07058823853731155, 0.8588235378265381, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.7764706015586853, 0.7137255072593689, 0.9686274528503418, 0.9450980424880981, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3137255012989044, 0.6117647290229797, 0.41960784792900085, 0.9921568632125854, 0.9921568632125854, 0.8039215803146362, 0.04313725605607033, 0.0, 0.16862745583057404, 0.6039215922355652, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.054901961237192154, 0.003921568859368563, 0.6039215922355652, 0.9921568632125854, 0.3529411852359772, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.545098066329956, 0.9921568632125854, 0.7450980544090271, 0.007843137718737125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04313725605607033, 0.7450980544090271, 0.9921568632125854, 0.27450981736183167, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13725490868091583, 0.9450980424880981, 0.8823529481887817, 0.6274510025978088, 0.42352941632270813, 0.003921568859368563, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3176470696926117, 0.9411764740943909, 0.9921568632125854, 0.9921568632125854, 0.46666666865348816, 0.09803921729326248, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1764705926179886, 0.729411780834198, 0.9921568632125854, 0.9921568632125854, 0.5882353186607361, 0.10588235408067703, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.062745101749897, 0.364705890417099, 0.9882352948188782, 0.9921568632125854, 0.7333333492279053, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9764705896377563, 0.9921568632125854, 0.9764705896377563, 0.250980406999588, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18039216101169586, 0.5098039507865906, 0.7176470756530762, 0.9921568632125854, 0.9921568632125854, 0.8117647171020508, 0.007843137718737125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15294118225574493, 0.5803921818733215, 0.8980392217636108, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9803921580314636, 0.7137255072593689, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0941176488995552, 0.4470588266849518, 0.8666666746139526, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.7882353067398071, 0.30588236451148987, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09019608050584793, 0.25882354378700256, 0.8352941274642944, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.7764706015586853, 0.3176470696926117, 0.007843137718737125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07058823853731155, 0.6705882549285889, 0.8588235378265381, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.7647058963775635, 0.3137255012989044, 0.03529411926865578, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.21568627655506134, 0.6745098233222961, 0.886274516582489, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.95686274766922, 0.5215686559677124, 0.04313725605607033, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.5333333611488342, 0.9921568632125854, 0.9921568632125854, 0.9921568632125854, 0.8313725590705872, 0.529411792755127, 0.5176470875740051, 0.062745101749897, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]]]}

In [37]:
response = psc_endpoint.predict(
            instances=data["instances"], 
            endpoint_override=IP_ADDRESS
        )
print(response.predictions[0])

[-7.792285919189453, -2.458911180496216, -6.757010459899902, 7.791323661804199, -25.71830368041992, 13.0219030380249, -12.47751426696777, -9.414200782775879, -12.70892524719238, -3.479300022125244]




### Deploy another model and update traffic split

Deploy another model, and update the traffic split to be 50:50, after the deployment is done, you can rerun the prediction again for multiple times, you should be able to see the deployed_model_id are different.

In [None]:
# psc_endpoint.deploy(model=model, traffic_percentage=50, machine_type="e2-standard-8")

In [None]:
# import os

# if not os.getenv("IS_TESTING"):
#     import json

#     import urllib3

#     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#     counter = {}
#     with open(REQUEST_FILE) as json_file:
#         data = json.load(json_file)
#         for i in range(1000):
#             response = psc_endpoint.predict(
#                 instances=data["instances"], endpoint_override=IP_ADDRESS
#             )
#             if response.deployed_model_id in counter.keys():
#                 counter[response.deployed_model_id] += 1
#             else:
#                 counter[response.deployed_model_id] = 1
#     print(counter)

You can update the traffic split with the following command and run the code above again.

In [None]:
# import os

# if not os.getenv("IS_TESTING"):
#     deployed_model_id_0 = list(counter)[0]
#     deployed_model_id_1 = list(counter)[1]

#     psc_endpoint.update(
#         traffic_split={deployed_model_id_0: 20, deployed_model_id_1: 80}
#     )

## Cleanup (optional)

In [None]:
psc_endpoint.undeploy_all()
psc_endpoint.delete()
model.delete()

In [None]:
! gcloud compute forwarding-rules delete op-psc-endpoint --region={LOCATION}  --quiet

! gcloud compute addresses delete psc-prediction --region={LOCATION} --quiet

Delete the bucket if needed.

In [None]:
! gsutil rm -r {BUCKET_URI}

Optionally, you can use the following command to clean up all private endpoint and models if needed.

In [None]:
for pe in aiplatform.PrivateEndpoint.list():
    pe.undeploy_all()
    pe.delete()