In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build Custom Serving Containers for Vertex AI Model Registry

## Overview

This method requires users to be responsible for direct development of their own containers. This provides the most granular-level control, enabling users to provide the serving container to customize for specific requirements such as model architectures, dependencies, and serving logic. [Custom Container Requirements](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements)

![method_2a.png](./imgs/method_2a.png)

### Install Vertex AI SDK for Python and other required packages

In [None]:
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

In [None]:
! pip3 install tf2onnx \
               onnxruntime

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
# import sys

# if "google.colab" in sys.modules:

#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
# import sys

# if "google.colab" in sys.modules:

#     from google.colab import auth

#     auth.authenticate_user()

In [3]:
import tf2onnx
import onnx

2025-04-01 19:58:46.490510: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

- `PROJECT_ID`: Google Cloud project ID where Vertex AI resources are deployed
- `LOCATION`: Google Cloud region where the Vertex AI endpoint is located
- `BUCKET_URI`: Google Cloud Storage bucket URI to store model artifacts and other data

In [4]:
PROJECT_ID = "sandbox-401718"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [5]:
# Create GCS Bucket
BUCKET_URI = f"gs://{PROJECT_ID}-pred-benchmark"  # @param {type:"string"}
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://sandbox-401718-pred-benchmark/...
ServiceException: 409 A Cloud Storage bucket named 'sandbox-401718-pred-benchmark' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [6]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

## Prepare Test Models

We prepared some test models, feel free to use your own models.

### MNIST Dataset

In [7]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.13.1


In [8]:
mnist = tf.keras.datasets.mnist

# load daatset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

### TF Train Example

In [9]:
# Build Model

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10),
    ]
)

input_signature = [tf.TensorSpec(shape=(None, 28, 28), dtype=tf.float32, name="x")]
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test, verbose=2)
model.save("saved_model")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 1s - loss: 0.0762 - accuracy: 0.9775 - 576ms/epoch - 2ms/step
INFO:tensorflow:Assets written to: saved_model/assets


INFO:tensorflow:Assets written to: saved_model/assets


In [10]:
# # Load your own model
# model = tf.keras.models.load_model("./saved_model")

In [11]:
# Tensorflow Model Predict
predictions = model(x_train[:1]).numpy()
predictions

array([[ -7.792286 ,  -2.4589105,  -6.757009 ,   7.791324 , -25.718307 ,
         13.021902 , -12.477514 ,  -9.414201 , -12.708923 ,  -3.4792993]],
      dtype=float32)

### Inference ONNX model

In [19]:
# # # Load your own model
# onnx_model = onnx.load("model.onnx")

In [10]:
import tf2onnx
import onnx

onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature, opset=13)
onnx.save(onnx_model, "./model.onnx")


2025-04-01 20:05:26.975973: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2025-04-01 20:05:26.976140: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2025-04-01 20:05:27.009929: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2025-04-01 20:05:27.010128: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


In [12]:
from onnxruntime import InferenceSession
import numpy as np

# Path to your ONNX model
onnx_model_path = "model.onnx"

input_data = x_train[:1].astype(np.float32)

# Create an inference session
session = InferenceSession(onnx_model_path)

# Get the input name
input_name = session.get_inputs()[0].name

# Run inference.
outputs = session.run(None, {input_name: input_data})
outputs

[array([[ -7.792286 ,  -2.4589112,  -6.7570105,   7.7913237, -25.718304 ,
          13.021903 , -12.477514 ,  -9.414201 , -12.708925 ,  -3.4793   ]],
       dtype=float32)]

## Build custom Serving container

- `ARTIFACT_REPO`: (Prerequisite) Name of the Artifact Registry repository to store the custom serving container image
- `JOB_IMAGE_ID`: Name of the Docker image for the custom serving container
- `VERSION`: Version or tag of the Docker image. Default set as latest.


In [15]:
ARTIFACT_REPO  = "workbench" # @param {type:"string"} ######################
JOB_IMAGE_ID = "vertex-custom-serve" # @param {type:"string"}
VERSION = "latest"

In [13]:
%%writefile app.py

import os
from typing import List
import numpy as np
from onnxruntime import InferenceSession
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import asyncio

# Input
class PredictionInput(BaseModel):
    instances: List[List[List[List[float]]]]

# Variables
MODEL_PATH = "/app/model.onnx"
AIP_HEALTH_ROUTE = os.environ.get("AIP_HEALTH_ROUTE", "/health")
AIP_PREDICT_ROUTE = os.environ.get("AIP_PREDICT_ROUTE", "/predict")

# initiate serving server
app = FastAPI(title="Serving Model")

# load model
@app.on_event("startup")
async def load_inference_session():
    global session
    session = InferenceSession(MODEL_PATH)

# check health
@app.get(AIP_HEALTH_ROUTE, status_code=200)
async def health():
    if session is None:
        return dict(status="unhealthy model not loaded")
    return dict(status="healthz")


# prediction endpoint 
@app.post(AIP_PREDICT_ROUTE)
async def predict(input_data: PredictionInput):

    global session

    if session is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    instances = input_data.instances
    instances = np.array(instances).astype(np.float32)[0]

    # Get the input name
    input_name = session.get_inputs()[0].name

    # Run inference.
    outputs = session.run(None, {input_name: instances})
    print(outputs)

    return dict(predictions=outputs[0].tolist())

Writing app.py


In [14]:
%%writefile Dockerfile

FROM python:3.10-slim

COPY ./requirements.txt /app/requirements.txt
COPY ./model.onnx /app/model.onnx
COPY ./app.py /app/app.py
WORKDIR ./app

RUN apt-get update && apt-get install gcc libffi-dev -y

RUN pip install -r requirements.txt

EXPOSE 8080

CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]

Writing Dockerfile


In [18]:
# # Build and push image to reigstry
! docker build . -f Dockerfile -t {LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO}/{JOB_IMAGE_ID}:{VERSION}
! gcloud auth configure-docker {LOCATION}-docker.pkg.dev --quiet
! docker push {LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO }/{JOB_IMAGE_ID}:{VERSION}

Sending build context to Docker daemon  2.464MB
Step 1/9 : FROM python:3.10-slim
 ---> 797a4d7093b1
Step 2/9 : COPY ./requirements.txt /app/requirements.txt
 ---> Using cache
 ---> abd4edfdb378
Step 3/9 : COPY ./model.onnx /app/model.onnx
 ---> Using cache
 ---> 12b217a893d0
Step 4/9 : COPY ./app.py /app/app.py
 ---> Using cache
 ---> d38bcf2b2c4f
Step 5/9 : WORKDIR ./app
 ---> Using cache
 ---> e9e9642e406f
Step 6/9 : RUN apt-get update && apt-get install gcc libffi-dev -y
 ---> Using cache
 ---> 0616767449d1
Step 7/9 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> f3c04c21a414
Step 8/9 : EXPOSE 8080
 ---> Using cache
 ---> df6d3db63751
Step 9/9 : CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "app:app"]
 ---> Using cache
 ---> 4bc047f8d2b2
Successfully built 4bc047f8d2b2
Successfully tagged us-west2-docker.pkg.dev/sandbox-401718/workbench/vertex-custom-serve:latest

{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud"

### Upload Model

In [19]:
model = aiplatform.Model.upload(
    display_name=JOB_IMAGE_ID,
    location = LOCATION,
    serving_container_image_uri=f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{ARTIFACT_REPO }/{JOB_IMAGE_ID}:{VERSION}",
    serving_container_predict_route='/predict',
    serving_container_health_route='/health',
    serving_container_ports=[8080],
)

model.wait()

Creating Model


INFO:google.cloud.aiplatform.models:Creating Model


Create Model backing LRO: projects/757654702990/locations/us-west2/models/2875552760122572800/operations/2943547108940054528


INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/757654702990/locations/us-west2/models/2875552760122572800/operations/2943547108940054528


Model created. Resource name: projects/757654702990/locations/us-west2/models/2875552760122572800@1


INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/757654702990/locations/us-west2/models/2875552760122572800@1


To use this Model in another session:


INFO:google.cloud.aiplatform.models:To use this Model in another session:


model = aiplatform.Model('projects/757654702990/locations/us-west2/models/2875552760122572800@1')


INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/757654702990/locations/us-west2/models/2875552760122572800@1')


### Note: Containers without the model included

Alternatively, if model artifacts are stored separately in Cloud Storage, the container downloads them at startup using the AIP_STORAGE_URI environment variable provided by Vertex AI. Users can then register the model in the Model Registry by specifying the `custom serving image` along with the `artifact_uri` parameters.

![method_2b.png](./imgs/method_2b.png)

Documenation for [Accessing Model Artifacts](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#artifacts)



In [20]:
psc_endpoint = aiplatform.PrivateEndpoint.create(
    display_name="psc-endpoint",
    project=PROJECT_ID,
    location=LOCATION,
    private_service_connect_config=aiplatform.PrivateEndpoint.PrivateServiceConnectConfig(
        project_allowlist=[PROJECT_ID],
    ),
)

Creating PrivateEndpoint


INFO:google.cloud.aiplatform.models:Creating PrivateEndpoint


Create PrivateEndpoint backing LRO: projects/757654702990/locations/us-west2/endpoints/6086804012391202816/operations/48014023517536256


INFO:google.cloud.aiplatform.models:Create PrivateEndpoint backing LRO: projects/757654702990/locations/us-west2/endpoints/6086804012391202816/operations/48014023517536256


PrivateEndpoint created. Resource name: projects/757654702990/locations/us-west2/endpoints/6086804012391202816


INFO:google.cloud.aiplatform.models:PrivateEndpoint created. Resource name: projects/757654702990/locations/us-west2/endpoints/6086804012391202816


To use this PrivateEndpoint in another session:


INFO:google.cloud.aiplatform.models:To use this PrivateEndpoint in another session:


endpoint = aiplatform.PrivateEndpoint('projects/757654702990/locations/us-west2/endpoints/6086804012391202816')


INFO:google.cloud.aiplatform.models:endpoint = aiplatform.PrivateEndpoint('projects/757654702990/locations/us-west2/endpoints/6086804012391202816')
