In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemma 3 on Vertex AI

Adapted from: [Vertex AI Model Garden - Gemma 3 Deployment on GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma3_deployment_on_vertex.ipynb)

Modified by: [Wan Qi Ang](https://github.com/angwanqi)

Last updated: 21 May 2025

## Before you begin
### Import libraries

In [None]:
from google.cloud import aiplatform

### Set your project ID and region

In [None]:
# Get the default cloud project id.
PROJECT_ID= !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

# Set the default region
REGION = "asia-southeast1"

use_dedicated_endpoint=False

endpoints = {}

print(f"Project ID:", PROJECT_ID)
print(f"Project Region:", REGION)

## Predicting with Vertex AI Prediction Endpoints

### Load your existing model

First, let's retrieve the details of the endpoints.

In [None]:
# Display name of the deployed Gemma 3 endpoint to look for.
GEMMA3_ENDPOINT_DISPLAY_NAME = "ai-takeoff-gemma"

# Retrieve the Vertex AI Prediction Endpoints for the same project and region
# that are used to build the resource path below.
all_endpoints = aiplatform.Endpoint.list(project=PROJECT_ID, location=REGION)

# Forget any handle left over from an earlier run of this cell so a deleted or
# renamed endpoint is never silently reused.
endpoints.pop("gemma3", None)

for endpoint in all_endpoints:
    if endpoint.display_name == GEMMA3_ENDPOINT_DISPLAY_NAME:
        # Only build the full resource path for the endpoint we actually want.
        full_endpoint = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}"
        endpoints["gemma3"] = aiplatform.Endpoint(full_endpoint)

# Without this check a missing endpoint surfaces as an opaque KeyError: 'gemma3'.
if "gemma3" not in endpoints:
    raise LookupError(
        f"No endpoint with display name '{GEMMA3_ENDPOINT_DISPLAY_NAME}' was found "
        f"in project '{PROJECT_ID}', region '{REGION}'. "
        "Deploy the model first, or update the display name / region above."
    )

print(f"Gemma 3 Endpoint Name: {endpoints['gemma3'].display_name}")

# List all existing endpoints
# !gcloud ai endpoints list --project=$PROJECT_ID --region=$REGION

## Making predictions

This part assumes that you have a Vertex endpoint with Gemma 3 deployed. With the model deployed, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

**Example:**

```
Human: What is a car?

Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```

Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

In [None]:
# Text sent to the model -- edit this to ask something else.
prompt = "What is a car?"

# Output formatting switch:
#   True  -> obtain the raw model output.
#   False -> apply additional formatting in the structure of
#            "Prompt:\n{prompt.strip()}\nOutput:\n{output}".
raw_response = False

# Generation settings included in the prediction request.
top_k = 1
top_p = 1.0
temperature = 1.0
max_tokens = 100

<div class="alert alert-block alert-warning">
<b>⚠️ If you encounter an error such as <code>ServiceUnavailable: 503 Took too long to respond when processing</code>, reduce the maximum number of output tokens by lowering <code>max_tokens</code>. ⚠️</b>
</div>

In [None]:
# Bundle the prompt together with the per-request inference overrides
# into a single prediction instance.
request_payload = dict(
    prompt=prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    raw_response=raw_response,
)
instances = [request_payload]

# Send the request to the Gemma 3 endpoint registered earlier.
gemma3_endpoint = endpoints["gemma3"]
response = gemma3_endpoint.predict(
    instances=instances,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

In [None]:
# Show every prediction returned by the endpoint, one per line.
generated_outputs = response.predictions
for generated in generated_outputs:
    print(generated)

For more information on `<start_of_turn>` and `<end_of_turn>`, refer to the [Gemma prompt structure documentation](https://ai.google.dev/gemma/docs/core/prompt-structure).