In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Gemma 3 Deployment on Trillium TPUs

## Before you begin

In [2]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1"). If not set, a unique GCS bucket will be created instead.

# A single form field for the bucket. The value is the one the notebook was
# last run with; set it to "gs://" to have a temporary bucket created instead.
BUCKET_URI = "gs://llama31_training-europe"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

# REGION is used for Vertex AI initialization and the bucket check;
# TPU_DEPLOYMENT_REGION is where the endpoint and model are created.
REGION = TPU_DEPLOYMENT_REGION = "europe-west4"  # @param {type:"string"}

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

models, endpoints = {}, {}

common_util = importlib.import_module(
    "vertex-ai-samples.notebooks.community.model_garden.docker_source_codes.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

# Sub-folders of the bucket: one passed to Vertex AI as the staging bucket,
# one reserved for model artifacts.
STAGING_BUCKET, MODEL_BUCKET = (
    os.path.join(BUCKET_URI, folder) for folder in ("temporal", "vllm_tpu")
)


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(
    staging_bucket=STAGING_BUCKET,
    location=REGION,
    project=PROJECT_ID,
)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

fatal: destination path 'vertex-ai-samples' already exists and is not an empty directory.


2025-09-07 09:48:19.923831: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757238499.950123   12050 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757238499.957738   12050 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757238499.977676   12050 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757238499.977711   12050 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757238499.977715   12050 computation_placer.cc:177] computation placer alr

Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-87995179092-dd49b808-e449-41b7-97e6-2c9bb1a0e83c" finished successfully.
Using this GCS Bucket: gs://llama31_training-europe
Initializing Vertex AI API.
Using this default Service Account: 87995179092-compute@developer.gserviceaccount.com
No changes made to gs://llama31_training-europe/
Updated property [core/project].
 [1] EXPRESSION=request.time < timestamp("2025-05-26T15:00:14.127Z"), TITLE=cloudbuild-connection-setup
 [2] None
 [3] Specify a new condition
The policy contains bindings with conditions, so specifying a condition is 
required when adding a binding. Please specify a condition.:  ^C
 [1] EXPRESSION=request.time < timestamp("2025-05-26T15:00:14.127Z"), TITLE=cloudbuild-connection-setup
 [2] None
 [3] Specify a new condition
The policy contains bindings with conditions, so specifying a condition is 
required when adding a binding. Please specify a condition.:  ^C


In [3]:
# @title Access the models
# @markdown ### Access gemma-3-27b-it models on Vertex AI for serving
# @markdown Models from Hugging Face can be used for serving in Vertex AI.
# @markdown 1. Open the [gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it) model page.
# @markdown 2. Review and accept the agreement.
# @markdown 3. After you accept the agreement, the models will be available for serving.
# @markdown 4. You must provide a Hugging Face User Access Token (with read access) to access the Gemma 3 model. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

HF_TOKEN = "hf_here"  # @param {type:"string", isTemplate:true}
# Drop whitespace picked up when pasting the token, so that a blank-looking
# value is reported as missing instead of being forwarded to the container.
HF_TOKEN = HF_TOKEN.strip()
if not HF_TOKEN:
    print("Provide a read HF_TOKEN to access Gemma 3 models from Hugging Face")

In [4]:
# @title Prepare

# Model identity: `model_id` is the value later passed to the server as
# `--model`; the publisher fields form the Model Garden source model name.
MODEL_ID = "gemma-3-27b-it"
model_path_prefix = "google/"
model_id = os.path.join(model_path_prefix, MODEL_ID)
model_publisher, model_publisher_id = "google", "gemma3"

# TPU hardware used for the deployment.
tpu_type = "TPU_V6e"
tpu_topo = "2x2"
tpu_count = 4
machine_type = "ct6e-standard-4t"

# Serving container image.
vLLM_TPU_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20250819_0917_tpu_experimental_RC01"
)
# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Server parameters: shard the model across every TPU chip configured above.
tensor_parallel_size = tpu_count

# Maximum context length for a request.
max_model_len = 8192  # @param

# Endpoint configurations: a fixed single replica.
min_replica_count = max_replica_count = 1

run_name = "test-deployment"  # @param {type:"string"}

# @markdown Note: The vLLM-TPU container used in this notebook is in experimental status.

## Deploy the prebuilt Gemma 3 model with vLLM on TPUs
This section downloads the prebuilt model configured in the previous section and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish, depending on the size of the model.

In [5]:
# @title Deploy
def deploy_model_vllm_tpu(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct6e-standard-1t",
    tpu_topology: str = "1x1",
    max_model_len: int = 4096,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-on-TPU serving model and deploys it to an endpoint.

    Args:
        model_name: Display name of the uploaded model; also the prefix of the
            display name of a newly created endpoint.
        model_id: Value passed to the vLLM server as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Service account the deployment runs as.
        base_model_id: Value of the `MODEL_ID` container environment variable;
            falls back to `model_id` when empty.
        tensor_parallel_size: Value of `--tensor_parallel_size`; when falsy it
            is read from the second-to-last character of `machine_type`.
        machine_type: Machine type used for the deployment.
        tpu_topology: Topology string such as "2x2"; forwarded to the
            deployment only when its first dimension is greater than 1.
        max_model_len: Value of `--max_model_len`.
        enable_chunked_prefill: Adds `--enable-chunked-prefill` when true.
        enable_prefix_cache: Adds `--enable-prefix-caching` when true.
        endpoint_id: ID of an existing endpoint to reuse; a new endpoint is
            created when empty.
        min_replica_count: Minimum number of replicas.
        max_replica_count: Maximum number of replicas.
        use_dedicated_endpoint: Whether a newly created endpoint is dedicated.
        model_type: Accepted but not used by this function.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    # Step 1: obtain the endpoint, creating one unless an ID was supplied.
    if not endpoint_id:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )
    else:
        endpoint = aiplatform.Endpoint(
            f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
        )

    # Step 2: resolve the optional arguments.
    base_model_id = base_model_id or model_id
    tensor_parallel_size = tensor_parallel_size or int(machine_type[-2])
    leading_dimension = int(tpu_topology.split("x")[0])
    deploy_topology = tpu_topology if leading_dimension > 1 else None

    # Step 3: command line of the vLLM API server inside the container.
    server_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--max_model_len={max_model_len}",
        "--limit_mm_per_prompt.image=0",
    ]
    optional_flags = (
        (enable_chunked_prefill, "--enable-chunked-prefill"),
        (enable_prefix_cache, "--enable-prefix-caching"),
    )
    server_args += [flag for enabled, flag in optional_flags if enabled]

    # Step 4: container environment.
    container_env = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
        "VLLM_USE_V1": "1",
    }
    # HF_TOKEN is an optional notebook-level variable and may not exist at all.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        container_env["HF_TOKEN"] = hf_token

    # Step 5: register the model, then deploy it to the endpoint.
    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=vLLM_TPU_DOCKER_URI,
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=4500,
        model_garden_source_model_name=source_model_name,
        location=TPU_DEPLOYMENT_REGION,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        tpu_topology=deploy_topology,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_qwen3_deployment_tpu.ipynb",
        },
    )
    return model, endpoint


# Deploy with the settings chosen in the "Prepare" cell and record the returned
# handles so that the prediction and clean-up cells can find them.
deploy_kwargs = dict(
    model_name=common_util.get_job_name_with_datetime(prefix=run_name),
    model_id=model_id,
    publisher=model_publisher,
    publisher_model_id=model_publisher_id,
    service_account=SERVICE_ACCOUNT,
    tensor_parallel_size=tensor_parallel_size,
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    max_model_len=max_model_len,
    enable_chunked_prefill=False,
    enable_prefix_cache=False,
    use_dedicated_endpoint=use_dedicated_endpoint,
)
models["vllmtpu"], endpoints["vllmtpu"] = deploy_model_vllm_tpu(**deploy_kwargs)

Creating Endpoint
Create Endpoint backing LRO: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160/operations/300970554821705728
Endpoint created. Resource name: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/87995179092/locations/europe-west4/endpoints/8789946752208732160')
Creating Model
Create Model backing LRO: projects/87995179092/locations/europe-west4/models/5957068237383401472/operations/2471705575214284800
Model created. Resource name: projects/87995179092/locations/europe-west4/models/5957068237383401472@1
To use this Model in another session:
model = aiplatform.Model('projects/87995179092/locations/europe-west4/models/5957068237383401472@1')
Deploying model to Endpoint : projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Deploy Endpoint model backing LRO: projects/87995179092/locations/europe-west4/endpoints/878994675220873

In [6]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}

# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# A single request instance carrying the prompt and the per-request overrides.
instances = [
    dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        raw_response=raw_response,
    )
]
response = endpoints["vllmtpu"].predict(
    use_dedicated_endpoint=use_dedicated_endpoint,
    instances=instances,
)

# Print each returned prediction on its own.
for prediction in response.predictions:
    print(prediction)
# @markdown Note: Top-k sampling is not currently enabled for vLLM on TPU.

Prompt:
What is a car?
Output:
 A car is a vehicle that has the ability to transport people or goods from one place to another. They typically have four wheels and operate on roads.

But the concept of a car has evolved over time.

Here's a more comprehensive look


## Clean up resources


In [7]:
# @title Delete the models and endpoints
# @markdown  Delete the experiment models and endpoints to release the resources
# @markdown  and avoid unnecessary ongoing charges.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME

Undeploying Endpoint model: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Undeploy Endpoint model backing LRO: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160/operations/2723907154347032576
Endpoint model undeployed. Resource name: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Deleting Endpoint : projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Endpoint deleted. . Resource name: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Deleting Endpoint resource: projects/87995179092/locations/europe-west4/endpoints/8789946752208732160
Delete Endpoint backing LRO: projects/87995179092/locations/europe-west4/operations/1451077309661446144
Endpoint resource projects/87995179092/locations/europe-west4/endpoints/8789946752208732160 deleted.
Deleting Model : projects/87995179092/locations/europe-west4/models/5957068237383401472
Model deleted. . Resource name: projects/8