In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Online Prediction PSC-based private endpoint

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fprediction%2Fget_started_with_psc_private_endpoint.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/get_started_with_psc_private_endpoint.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

Compared to the current PSA Private Endpoint, PSC based Private Endpoint has the following benefits:
1. Simpler setup process: Currently, the only extra step users need to take is to create an endpoint in their VPC. This will be done by PSC automatically before our GA launch.

2. No more IP exhaustion issues: The GKE cluster is hosted in the tenant project's VPC, so we can create a much bigger cluster that isn't affected by IP exhaustion in the user's VPC.

3. Unified experience with public endpoints: The API is the same as for public endpoints, so users can use our SDK/client libraries. We also provide quota, IAM, and monitoring metrics, just as public endpoints do.


## Get started

### Install Vertex AI SDK for Python and other required packages

In [None]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

### Restart runtime (Colab only; not needed when you use a Vertex AI notebook)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# The restart is only triggered when the google.colab module is loaded,
# i.e. when this notebook runs on Colab; elsewhere this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    app = IPython.Application.instance()
    kernel = app.kernel
    kernel.do_shutdown(True)  # True asks for a restart instead of a plain shutdown

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only; not needed when you use a Vertex AI notebook)

Authenticate your environment on Google Colab.


In [1]:
import sys

# Authentication is only requested when the google.colab module is loaded,
# i.e. when this notebook runs on Colab; elsewhere this cell does nothing.
if "google.colab" in sys.modules:

    from google.colab import auth

    # Authenticate the Colab user for the Google Cloud calls made below.
    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [2]:
# GenBio team: set your Google Cloud project ID and location (region) here.
# Both values are reused by the SDK initialization and by the gsutil/gcloud commands below.
PROJECT_ID = "northam-ce-mlai-tpu"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [3]:
# Create GCS Bucket
BUCKET_URI = "gs://av-genb-testing"  # @param {type:"string"} #GenBio team: Please add the bucket name here
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI} 

Creating gs://av-genb-testing/...


In [4]:
from google.cloud import aiplatform

# Set the SDK defaults (project, region, staging bucket) once for the whole notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Prepare Test Models

We prepared some test models; feel free to use your own models instead.

In [5]:
# Copy the sample model artifacts and request files into the bucket.
# GenBio team: to use your own model artifacts, change the source GCS URL below
# to the location of your models.
! gsutil cp -r "gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/*" {BUCKET_URI}

Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/country.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/language.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/operating_system.txt [Content-Type=application/octet-stream]...
/ [3 files][  2.0 KiB/  2.0 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/assets/user_pseudo_id.txt [Content-Type=application/octet-stream]...
Copying gs://cloud-samples-data/vertex-ai/prediction/test-models-requests/churn/ex

### Upload Model

In [12]:
# Settings for the model to upload. Depending on which model you want to use,
# adjust the values below and run the block.
# GenBio team: update the model name, artifact URI and serving image for your model.

# TF Model
DISPLAY_NAME = "tensorflow model"  # @param {type:"string"}
# Cloud Storage folder that contains the model artifacts.
ARTIFACT_URI = f"{BUCKET_URI}/tensorflow"
# Container image used for serving (prediction).
IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-12:latest"
# Input test data for the model.
REQUEST_FILE = "tensorflow_request.json"

In [13]:
# Upload the model artifacts together with the serving container image to Vertex AI.
model = aiplatform.Model.upload(
    display_name=DISPLAY_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=IMAGE_URI,
    sync=False,  # start the upload without blocking this call
)

# Block until the asynchronous upload has completed before using `model` below.
model.wait()

### Create PSC based Prediction Private Endpoint


In [14]:
# Private Service Connect settings: only the projects in the allowlist
# (here just this project) are listed as allowed consumers of the endpoint.
psc_config = aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
    project_allowlist=[PROJECT_ID],
)

# Create the PSC-based private endpoint that models are deployed to below.
psc_endpoint = aiplatform.PrivateEndpoint.create(
    display_name="psc-endpoint",
    project=PROJECT_ID,
    location=LOCATION,
    private_service_connect_config=psc_config,
)

Creating PrivateEndpoint
Create PrivateEndpoint backing LRO: projects/9452062936/locations/us-central1/endpoints/8894471825103257600/operations/4693467658941628416
PrivateEndpoint created. Resource name: projects/9452062936/locations/us-central1/endpoints/8894471825103257600
To use this PrivateEndpoint in another session:
endpoint = aiplatform.PrivateEndpoint('projects/9452062936/locations/us-central1/endpoints/8894471825103257600')


Alternatively, send an HTTP request to create the endpoint. You need to manually replace ALL the variables below.

In [None]:
# Reference only: REST request for creating the endpoint, kept commented out.
# Every ${...} placeholder must be replaced by hand before running it.
# ! curl -X POST -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" https://${LOCATION}-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/${LOCATION}/endpoints -d \
# '{ \
#     displayName: "psc-endpoint", \
#     privateServiceConnectConfig: { \
#       enablePrivateServiceConnect: true, \
#       projectAllowlist: ["${PROJECT_ID}"] \
#     }, \
# }'

### Deploy Model

In [15]:
# Deploy the uploaded model to the private endpoint on a g2-standard-12 machine
# with one NVIDIA L4 accelerator, routing all traffic to it.
psc_endpoint.deploy(
    model=model,
    traffic_percentage=100,
    machine_type="g2-standard-12",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)

# Display the deployed models; the entry includes the service attachment used below.
psc_endpoint.list_models()

Deploying Model projects/9452062936/locations/us-central1/models/3190082354899058688 to PrivateEndpoint : projects/9452062936/locations/us-central1/endpoints/8894471825103257600
Deploy PrivateEndpoint model backing LRO: projects/9452062936/locations/us-central1/endpoints/8894471825103257600/operations/8031760882730008576
PrivateEndpoint model deployed. Resource name: projects/9452062936/locations/us-central1/endpoints/8894471825103257600


[id: "1049720243712163840"
 model: "projects/9452062936/locations/us-central1/models/3190082354899058688"
 display_name: "tensorflow model"
 create_time {
   seconds: 1754674634
   nanos: 999974000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "g2-standard-12"
     accelerator_type: NVIDIA_L4
     accelerator_count: 1
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 private_endpoints {
   service_attachment: "projects/kb74160a0c3895e22-tp/regions/us-central1/serviceAttachments/gkedpm-76304211f53247f860f25884f92bec"
 }
 model_version_id: "1"
 status {
   last_update_time {
     seconds: 1754676035
     nanos: 305971000
   }
   available_replica_count: 1
 }]

### Create Forwarding Rule in Consumer Project

#### Best Practices
A service attachment is a network resource that is used by multiple prediction endpoints. It is recommended to have a 1-1 mapping between the service attachment and the forwarding rule/IP address. This forwarding rule/IP address can then be used to access all endpoints that use the corresponding service attachment. 
Please note that the service attachment is only preserved while there is an active deployed model. If all models are undeployed from the endpoint for a while, the service attachment will be recycled, and a new one will be created when a new model is deployed. This means that the service attachment can change for the same endpoint if no active models are deployed. In that case, the forwarding rule should be deleted and recreated with the new service attachment.

#### Create the resources
First, find the service attachment from the endpoint and deployed model.

In [16]:
# Read the service attachment from the first deployed model on the endpoint;
# the forwarding rule created below targets it.
deployed_models = psc_endpoint.list_models()
service_attachment = deployed_models[0].private_endpoints.service_attachment
print(service_attachment)

projects/kb74160a0c3895e22-tp/regions/us-central1/serviceAttachments/gkedpm-76304211f53247f860f25884f92bec


Then, create an address and a forwarding rule that targets the service attachment. In this example, the default network and subnet are used; replace them with your VPC network and subnet if you are running in your own VPC.

In [21]:
! gcloud compute addresses create psc-prediction \
    --region={LOCATION} \
    --subnet=default ##GenBio team: Please update the subnet you'd like to use here

! gcloud compute forwarding-rules create op-psc-endpoint \
    --network=default \ ##GenBio team: Please update the VPC network you'd like to use here
    --address=psc-prediction \
    --target-service-attachment={service_attachment} \
    --region={LOCATION}

Created [https://www.googleapis.com/compute/v1/projects/northam-ce-mlai-tpu/regions/us-central1/addresses/psc-prediction].
Created [https://www.googleapis.com/compute/v1/projects/northam-ce-mlai-tpu/regions/us-central1/forwardingRules/op-psc-endpoint].


Save the IP address above.

In [22]:
IP_ADDRESS = ! gcloud compute forwarding-rules describe op-psc-endpoint --region={LOCATION} --format='value(IPAddress)'
IP_ADDRESS = IP_ADDRESS[0]
print(IP_ADDRESS)

10.128.0.149


## Make Predictions

From this point, all the code below must be run from a GCP VM in the same VPC, same region as your PSC Endpoint.

If you're using Vertex AI Workbench or Colab Enterprise, you should be good.

If you're creating a GCE VM, please make sure Cloud Platform access scope is enabled.

In [17]:
# Download the sample request files from the bucket into the current working
# directory; REQUEST_FILE is opened from there by the prediction cells below.
! gsutil cp {BUCKET_URI}/requests/* ./

Copying gs://av-genb-testing/requests/pytorch_request.json...
Copying gs://av-genb-testing/requests/sklearn_request.json...                   
Copying gs://av-genb-testing/requests/tensorflow_request.json...                
Copying gs://av-genb-testing/requests/vision_small_request.json...              
/ [4 files][ 16.8 KiB/ 16.8 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://av-genb-testing/requests/xgboost_request.json...
/ [5 files][ 16.9 KiB/ 16.9 KiB]                                                
Operation completed over 5 objects/16.9 KiB.                                     


In [23]:
import os

# Skipped entirely when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import json

    import urllib3

    # Silence urllib3's InsecureRequestWarning for the requests sent to the endpoint IP.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Load the sample request payload.
    with open(REQUEST_FILE) as json_file:
        data = json.load(json_file)

    # Send the prediction to the forwarding-rule IP instead of the default address.
    response = psc_endpoint.predict(
        instances=data["instances"], endpoint_override=IP_ADDRESS
    )
    print(response)

Prediction(predictions=[[-357.207214], [-171.659576]], deployed_model_id='1049720243712163840', metadata=None, model_version_id='1', model_resource_name='projects/9452062936/locations/us-central1/models/3190082354899058688', explanations=None)


### Predict Requests

Alternatively, you can send HTTP requests directly to the IP address. Make sure to replace all variables in the requests.

In [24]:
# Resource name of the endpoint; it forms the URL path of the raw HTTP requests below.
ENDPOINT_RESOURCE_NAME = psc_endpoint.resource_name

In [25]:
import os

# Skipped entirely when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # Predict
    # POST the request file to the endpoint's :predict method through the forwarding-rule IP,
    # authenticating with the current gcloud access token. `--insecure` turns off curl's
    # TLS certificate verification for this call.
    ! curl --insecure -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`"  https://{IP_ADDRESS}/v1/{ENDPOINT_RESOURCE_NAME}:predict -d@{REQUEST_FILE}

    # # RawPredict
    # Same request against the :rawPredict method; uncomment to try it.
    # ! curl -v --insecure -H "Content-Type: application/json" -H "Authorization: Bearer `gcloud auth print-access-token`" https://{IP_ADDRESS}/v1/{ENDPOINT_RESOURCE_NAME}:rawPredict -d@{REQUEST_FILE}

{
	"deployedModelId" : "1049720243712163840",
	"model" : "projects/9452062936/locations/us-central1/models/3190082354899058688",
	"modelDisplayName" : "tensorflow model",
	"modelVersionId" : "1",
	"predictions" : 
	[
		[
			-357.207214
		],
		[
			-171.659576
		]
	]
}


### Deploy another model and update traffic split

Deploy another model and update the traffic split to 50:50. After the deployment is done, rerun the prediction multiple times; you should see that the returned `deployed_model_id` values differ.

In [None]:
# Deploy the same uploaded model a second time, on an e2-standard-8 machine,
# and give the new deployment 50% of the traffic.
psc_endpoint.deploy(
    model=model,
    traffic_percentage=50,
    machine_type="e2-standard-8",
)

In [None]:
import os

# Skipped entirely when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import json

    import urllib3

    # Silence urllib3's InsecureRequestWarning for the requests sent to the endpoint IP.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Maps deployed_model_id -> number of responses served by that deployment.
    counter = {}
    with open(REQUEST_FILE) as json_file:
        data = json.load(json_file)
        # Send the same request 1000 times and tally which deployment answered.
        for i in range(1000):
            response = psc_endpoint.predict(
                instances=data["instances"], endpoint_override=IP_ADDRESS
            )
            served_by = response.deployed_model_id
            counter[served_by] = counter.get(served_by, 0) + 1
    print(counter)

You can update the traffic split with the following command and run the code above again.

In [None]:
import os

# Skipped entirely when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # The keys of `counter` are the deployed model IDs seen in the tally above;
    # two of them are needed here.
    deployed_model_ids = list(counter)
    deployed_model_id_0 = deployed_model_ids[0]
    deployed_model_id_1 = deployed_model_ids[1]

    # Shift the split to 20% / 80% between the two deployments.
    new_traffic_split = {deployed_model_id_0: 20, deployed_model_id_1: 80}
    psc_endpoint.update(traffic_split=new_traffic_split)

## Cleanup

In [None]:
# Tear down in dependency order: undeploy every model from the endpoint,
# then delete the endpoint, then delete the uploaded model.
psc_endpoint.undeploy_all()
psc_endpoint.delete()
model.delete()

In [None]:
# Delete the forwarding rule first, then the address it used.
# --quiet skips the interactive confirmation prompt.
! gcloud compute forwarding-rules delete op-psc-endpoint --region={LOCATION}  --quiet

! gcloud compute addresses delete psc-prediction --region={LOCATION} --quiet

Delete the bucket if needed.

In [None]:
# WARNING: recursively removes everything under BUCKET_URI.
# Only run this if nothing else in the bucket is still needed.
! gsutil rm -r {BUCKET_URI}

Optionally, you can use the following command to clean up all private endpoint and models if needed.

In [None]:
# WARNING: this walks over every private endpoint returned by list(), not only
# the one created in this notebook, undeploying its models and deleting it.
for private_endpoint in aiplatform.PrivateEndpoint.list():
    private_endpoint.undeploy_all()
    private_endpoint.delete()