In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Finetuning Llama 3 with Text on Vertex AI

Adapted from: [Vertex AI Model Garden - Llama 3 Finetuning on GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_finetuning.ipynb)

Modified by: [Wan Qi Ang](https://github.com/angwanqi)

Last updated: 16 April 2025

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_finetuning.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_finetuning.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates finetuning and deploying Llama 3 models with Vertex AI. All of the examples in this notebook use parameter efficient finetuning methods [PEFT (LoRA)](https://github.com/huggingface/peft) to reduce training and storage costs. LoRA (Low-Rank Adaptation) is one approach of Parameter Efficient FineTuning (PEFT), where pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during finetuning. Read more about LoRA in the following publication: [Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. Lora: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).

After finetuning, we can deploy models on Vertex with GPU.


### Objective

- Finetune Llama 3 models with Vertex AI Custom Training Jobs.
- Deploy finetuned Llama 3 models on Vertex AI Prediction.
- Send prediction requests to your finetuned Llama 3 models.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin
### Install Python Packages for Finetuning

In [None]:
# Import the necessary packages
# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'
! rm -rf vertex-ai-samples && git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

### Restart current runtime
To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.
<div class="alert alert-block alert-success"> 
<b>NOTE:</b> Only restart the current runtime if you installed libraries. If you did not install new libraries, you do not need to restart the kernel.
</div>

In [None]:
# Restart the kernel so the freshly installed packages become importable.
import time

import IPython

kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

### Import necessary libraries

In [None]:
import datetime
import importlib
import os
import uuid
from typing import Tuple

import pandas as pd
from google.cloud import aiplatform

# Load the shared notebook helpers from the cloned `vertex-ai-samples` repo.
# `importlib.import_module` is used because the path contains hyphenated
# directory names ("vertex-ai-samples", "community-content"), which cannot be
# written in a regular `import` statement.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

### Set Project ID

In [None]:
# Get the default cloud project id.
PROJECT_ID= !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

# # Get the default cloud project id.
# PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# # Get the default region for launching jobs.
# if not REGION:
#     if not os.environ.get("GOOGLE_CLOUD_REGION"):
#         raise ValueError(
#             "REGION must be set. See"
#             " https://cloud.google.com/vertex-ai/docs/general/locations for"
#             " available cloud locations."
#         )
#     REGION = os.environ["GOOGLE_CLOUD_REGION"]

### Set variables

In [None]:
# Default region for launching jobs.
REGION = "asia-southeast1"

# Handles of the models and endpoints created by this notebook; filled in by
# the deploy cell and read by the predict and clean-up cells.
models = {}
endpoints = {}

# Dedicated endpoint not supported yet
use_dedicated_endpoint = False

print("Project ID:", PROJECT_ID)
print("Project Region:", REGION)

### Enable Necessary APIs

In [None]:
# Enable the Vertex AI API and Compute Engine API, if not already.
# No --project flag is passed, so gcloud applies this to its currently active project.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

### Set (or create) the Google Cloud Storage bucket

<div class="alert alert-block alert-info"> 
<b>INPUT REQUIRED:</b> Replace <code>&lt;YOUR_NAME&gt;</code> with your name so that you'll be able to identify your Google Cloud Storage bucket later on. <b>Example:</b> <code>PREFIX = "john"</code>
</div>

In [None]:
# Update the bucket prefix with your name
PREFIX = "<YOUR_NAME>"

# Fail fast if the placeholder was not replaced: angle brackets are not valid
# in a bucket name, so the failure would otherwise only surface in a later cell.
if not PREFIX or "<" in PREFIX or ">" in PREFIX:
    raise ValueError(
        'Replace "<YOUR_NAME>" in PREFIX with your name (for example "john")'
        " before running this cell."
    )

# Concatenate to get the full bucket name
BUCKET_NAME = PREFIX + "-llama3-tuning"
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Sub-folders of the bucket: staging area for jobs, and the base model artifacts.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
# EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama3")


print("ROOT_BUCKET_URI:", BUCKET_URI)
print("STAGING_BUCKET_URI:", STAGING_BUCKET)
# print(f"EXPERIMENT_BUCKET_URI:", EXPERIMENT_BUCKET)
print("MODEL_BUCKET_URI:", MODEL_BUCKET)

In [None]:
# Create Cloud Storage Bucket
# The bucket is created in REGION.
# NOTE(review): re-running this cell once the bucket exists presumably just
# prints an error from gcloud and can be ignored — confirm.
!gcloud storage buckets create $BUCKET_URI --location=$REGION

### Setting up the Compute Engine Service Account
#### Retrieve the default Compute Engine Service Account

In [None]:
# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

#### Assign Cloud Storage admin IAM role to the Service Account

In [None]:
# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Configure the Vertex AI SDK with this notebook's project, region and staging bucket.
print("Initializing Vertex AI API.")
aiplatform.init(
    staging_bucket=STAGING_BUCKET,
    location=REGION,
    project=PROJECT_ID,
)

## Access Llama 3 models

For GPU based finetuning and serving, choose between accessing Llama 3 models on [Hugging Face](https://huggingface.co/)
or Vertex AI as described below.

If you already obtained access to Llama 3 models on [Hugging Face](https://huggingface.co/), you can load models from there.
Alternatively, you can also load the original Llama 3 models for finetuning and serving from Vertex AI after accepting the agreement.

In [None]:
# Where the base model weights are loaded from; later cells compare this value
# against the two option strings listed on the right.
LOAD_MODEL_FROM = "Hugging Face"  # Options: ["Hugging Face", "Google Cloud"]

### Option 1: Access Llama 3 models on Hugging Face for GPU based finetuning and serving
You must provide a Hugging Face User Access Token (read) to access the Llama 3 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

*--- Or ---*
### Option 2: Access Llama 3 models on Vertex AI for GPU based finetuning and serving
The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
Accept the model agreement to access the models:
1. Open the [Llama 3 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
3. After accepting the agreement of Llama 3, a `gs://` URI containing Llama 3 pretrained and finetuned models will be shared.
4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA3` field below.

In [None]:
# Hugging Face read token (Option 1). It is taken from the HF_TOKEN environment
# variable when that is set, so the secret does not have to be saved in the
# notebook source; otherwise it stays "" and can be filled in here.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# gs:// URI of the Llama 3 artifacts shared after accepting the agreement (Option 2).
VERTEX_AI_MODEL_GARDEN_LLAMA3 = ""

In [None]:
if LOAD_MODEL_FROM == "Hugging Face":
    assert (HF_TOKEN), "Provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."
    
if LOAD_MODEL_FROM == "Google Cloud":
    assert (
        VERTEX_AI_MODEL_GARDEN_LLAMA3
    ), "Click the agreement of Llama 3 in Vertex AI Model Garden, and get the GCS path of Llama 3 model artifacts."
    print(
        "Copying Llama 3 model artifacts from",
        VERTEX_AI_MODEL_GARDEN_LLAMA3,
        "to ",
        MODEL_BUCKET,
    )
    HF_TOKEN = ""

    ! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA3/* $MODEL_BUCKET

## Finetune with HuggingFace PEFT and deploy with vLLM on GPUs
This notebook uses [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
You can set `train_dataset_name` and `eval_dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset has only one column, `text`, and therefore we set `instruct_column_in_dataset` to `text` in this notebook.

### Set dataset

In [None]:
# Prompt template: a template name or a gs:// URI to a custom template.
template = "openassistant-guanaco"

# Training and evaluation data: a Hugging Face dataset name or a gs:// URI to a
# custom JSONL dataset. Both splits come from the same dataset here.
train_dataset_name = eval_dataset_name = "timdettmers/openassistant-guanaco"
train_split_name, eval_split_name = "train", "test"

# Name of the dataset column containing training text input.
instruct_column_in_dataset = "text"

In [None]:
# Load the training split of the example dataset and have a look at what's in it.
# `pd` comes from the notebook's import cell (`import pandas as pd`).
# NOTE(review): reading an `hf://` path with pandas presumably needs the
# `huggingface_hub` package in this runtime — confirm it is installed.
GUANACO_HF_ROOT = "hf://datasets/timdettmers/openassistant-guanaco/"
splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}
df = pd.read_json(GUANACO_HF_ROOT + splits["train"], lines=True)

# Print the first 2 rows
df.head(2)

In [None]:
# Show the full text of the first training example.
df.loc[0, "text"]

## Finetune
This section demonstrates how to finetune the Llama 3 text only model and merge the finetuned LoRA adapter with the base model on Vertex AI. It uses the Vertex AI SDK to create and run the custom training jobs.

The training job takes approximately 10 to 20 minutes to set up. Once set up, training is expected to take around 1.5 to 2 hours with the default configuration. To find the training time, throughput, and memory usage of your training job, you can go to the training logs and check the log line of the last training epoch.

**Note**:
1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.
2. We recommend using NVIDIA_L4 for 8B models and NVIDIA_A100_80GB for 70B models.
3. If `max_steps > 0`, it takes precedence over `num_epochs`. You can set a small `max_steps` value to quickly check the pipeline.
4. With the default setting, training takes between 1.5 ~ 2 hours.

### Set variables for training

#### Set the base model ID that you would like to tune

In [None]:
# Select a model variant of Llama 3.
# All options - ["meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-70B", "meta-llama/Meta-Llama-3-70B-Instruct"]
# Later cells look for "8b" / "70b" in the lower-cased ID to pick machine types.

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# The accelerator to use - ["NVIDIA_L4", "NVIDIA_A100_80GB"]
# This is the training accelerator; the deploy cell assigns `accelerator_type` again.
accelerator_type = "NVIDIA_L4"

In [None]:
# Container image used for the custom training job.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:stable_20240909"

# Hugging Face model ID -> folder name of the same model under MODEL_BUCKET.
_GCS_MODEL_FOLDERS = {
    "meta-llama/Meta-Llama-3-8B": "llama3-8b-hf",
    "meta-llama/Meta-Llama-3-8B-Instruct": "llama3-8b-chat-hf",
    "meta-llama/Meta-Llama-3-70B": "llama3-70b-hf",
    "meta-llama/Meta-Llama-3-70B-Instruct": "llama3-70b-chat-hf",
}

if LOAD_MODEL_FROM != "Google Cloud":
    # Any other source loads the model directly by its ID.
    base_model_id = MODEL_ID
elif MODEL_ID in _GCS_MODEL_FOLDERS:
    base_model_id = os.path.join(MODEL_BUCKET, _GCS_MODEL_FOLDERS[MODEL_ID])
else:
    raise ValueError(f"Undefined model ID: {MODEL_ID}.")

# --- Batching and sequence length ---
# Per-device batch size and the number of gradient accumulation steps.
per_device_train_batch_size, gradient_accumulation_steps = 1, 8
# Maximum sequence length.
max_seq_length = 4096

# --- Training length ---
# Setting a positive `max_steps` here will override `num_epochs`
max_steps = -1
num_epochs = 1.0  # @param{type:"number"}

# --- Precision and optimisation ---
# Precision mode for finetuning.
finetuning_precision_mode = "4bit"  # @param ["4bit", "8bit", "float16"]
# Learning rate and its schedule.
learning_rate = 5e-5  # @param{type:"number"}
lr_scheduler_type = "cosine"  # @param{type:"string"}
optimizer = "paged_adamw_32bit"
# Kept as a string; it is only ever formatted into a command-line argument.
warmup_ratio = "0.01"
enable_gradient_checkpointing = True
attn_implementation = "flash_attention_2"

# --- LoRA parameters ---
lora_rank, lora_alpha = 16, 32
lora_dropout = 0.05  # @param{type:"number"}

# --- Logging and checkpointing ---
report_to = "tensorboard"
# Checkpoints and log lines share the same step interval.
logging_steps = save_steps = 10

# Worker pool spec.
# Recommended (machine_type, accelerator_count) for training, keyed first by
# the model-size marker found in MODEL_ID and then by accelerator type.
_TRAIN_MACHINE_SPECS = {
    "8b": {
        "NVIDIA_L4": ("g2-standard-48", 4),
        "NVIDIA_A100_80GB": ("a2-ultragpu-1g", 1),
    },
    "70b": {
        "NVIDIA_A100_80GB": ("a2-ultragpu-4g", 4),
    },
}

_model_id_lower = MODEL_ID.lower()
# Dict order is insertion order, so "8b" is tested before "70b".
_model_size = next(
    (size for size in _TRAIN_MACHINE_SPECS if size in _model_id_lower), None
)
if _model_size is None:
    raise ValueError(f"Unsupported model ID or GCS path: {MODEL_ID}.")
if accelerator_type not in _TRAIN_MACHINE_SPECS[_model_size]:
    # The message points at this training cell (not at the deploy function).
    raise ValueError(
        f"Recommended machine settings not found for: {accelerator_type}. To use"
        " another accelerator, edit this code block to set an appropriate"
        " `machine_type`, `accelerator_type`, and `accelerator_count` for the"
        " training job."
    )
machine_type, accelerator_count = _TRAIN_MACHINE_SPECS[_model_size][accelerator_type]

replica_count = 1

# NOTE(review): presumably verifies that the project has enough training quota
# for this accelerator in REGION — see `common_util.check_quota`.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=True,
)

job_name = common_util.get_job_name_with_datetime("llama3-lora-train")

# All outputs of this run live under one folder of the staging bucket.
base_output_dir = os.path.join(STAGING_BUCKET, job_name)
# Create a GCS folder to store the LORA adapter.
lora_output_dir = os.path.join(base_output_dir, "adapter")
# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_output_dir = os.path.join(base_output_dir, "merged-model")

# Add labels for the finetuning job.
labels = {
    "mg-source": "notebook",
    "mg-notebook-name": "model_garden_pytorch_llama3_finetuning",
    "mg-tune": "publishers-meta-models-llama3",
}
# Derive the model part from MODEL_ID rather than base_model_id: when the model
# is loaded from Cloud Storage, base_model_id is a gs:// path whose second
# "/"-separated component is the empty string, which left the label without a
# model name. For Hugging Face IDs the value is unchanged.
versioned_model_id = MODEL_ID.split("/")[-1].lower().replace(".", "-")
labels["versioned-mg-tune"] = f"{labels['mg-tune']}-{versioned_model_id}"

# Evaluation flags, rendered as `--name=value` in insertion order.
_eval_options = {
    "eval_dataset_path": eval_dataset_name,
    "eval_column": instruct_column_in_dataset,
    "eval_template": template,
    "eval_split": eval_split_name,
    "eval_steps": save_steps,
    "eval_tasks": "builtin_eval",
    "eval_metric_name": "loss",
}
eval_args = [f"--{flag}={value}" for flag, value in _eval_options.items()]

# Training flags, rendered the same way and followed by the evaluation flags.
_train_options = {
    # NOTE(review): this config name says "4gpu" while the 8B / NVIDIA_A100_80GB
    # setting uses accelerator_count = 1 — confirm the config matches the GPU count.
    "config_file": "vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml",
    "task": "instruct-lora",
    "completion_only": "True",
    "pretrained_model_id": base_model_id,
    "dataset_name": train_dataset_name,
    "train_split_name": train_split_name,
    "instruct_column_in_dataset": instruct_column_in_dataset,
    "output_dir": lora_output_dir,
    "merge_base_and_lora_output_dir": merged_model_output_dir,
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "lora_rank": lora_rank,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "max_steps": max_steps,
    "max_seq_length": max_seq_length,
    "learning_rate": learning_rate,
    "lr_scheduler_type": lr_scheduler_type,
    "precision_mode": finetuning_precision_mode,
    "enable_gradient_checkpointing": enable_gradient_checkpointing,
    "num_epochs": num_epochs,
    "attn_implementation": attn_implementation,
    "optimizer": optimizer,
    "warmup_ratio": warmup_ratio,
    "report_to": report_to,
    "logging_output_dir": base_output_dir,
    "save_steps": save_steps,
    "logging_steps": logging_steps,
    "template": template,
    # NOTE(review): the token travels as a plain command-line argument of the
    # job — presumably visible in the job spec; confirm this is acceptable.
    "huggingface_access_token": HF_TOKEN,
}
train_job_args = [
    f"--{flag}={value}" for flag, value in _train_options.items()
] + eval_args

# Create TensorBoard
# One TensorBoard instance and one experiment, both named after the job.
tensorboard = aiplatform.Tensorboard.create(job_name)
exp = aiplatform.TensorboardExperiment.create(
    tensorboard_experiment_id=job_name, tensorboard_name=tensorboard.name
)

# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
    labels=labels,
)

train_job.run(
    args=train_job_args,
    # Environment variable values are strings (the SDK parameter is typed
    # Dict[str, str]), so the flag is passed as "True" rather than a bool.
    environment_variables={"WANDB_DISABLED": "True"},
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
    service_account=SERVICE_ACCOUNT,
    tensorboard=tensorboard.resource_name,
    base_output_dir=base_output_dir,
)

print("LoRA adapter was saved in: ", lora_output_dir)
print("Trained and merged models were saved in: ", merged_model_output_dir)

### Run TensorBoard
 This section shows how to launch TensorBoard in a [Cloud Shell](https://cloud.google.com/shell/docs).
 1. Click the Cloud Shell icon(![terminal](https://github.com/google/material-design-icons/blob/master/png/action/terminal/materialicons/24dp/1x/baseline_terminal_black_24dp.png?raw=true)) on the top right to open the Cloud Shell.
 2. Run the `print` statement shown below in a code cell and copy the `tensorboard` command it prints.
 3. Paste and run the command in the Cloud Shell to launch TensorBoard.
 4. Once the command runs (You may have to click `Authorize` if prompted), click the link starting with `http://localhost`.

 Note: You may need to wait around 10 minutes after the job starts in order for the TensorBoard logs to be written to the GCS bucket.

```python
print(f"Command to copy: tensorboard --logdir {base_output_dir}/logs")
```

## Deploy
This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.

In [None]:
# Uncomment this code to reload your model
# merged_model_output_dir = "<REPLACE_WITH_MODEL_ARTIFACTS_GCS_URI>"  # E.g. "gs://<BUCKET_NAME>/temporal/<JOB_NAME>/merged-model"
# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
print("Deploying models in: ", merged_model_output_dir)

# The pre-built serving docker image for vLLM.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240721_0916_RC00"

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_H100_80GB"]

# Find Vertex AI prediction supported accelerators and regions in [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Serving (machine_type, accelerator_count) per accelerator: one table for 8B
# models, one for every other model ID.
_SERVE_SPECS_8B = {
    "NVIDIA_L4": ("g2-standard-12", 1),
    "NVIDIA_H100_80GB": ("a3-highgpu-2g", 2),
}
_SERVE_SPECS_OTHER = {
    "NVIDIA_L4": ("g2-standard-96", 8),
    "NVIDIA_H100_80GB": ("a3-highgpu-4g", 4),
}
_serve_specs = _SERVE_SPECS_8B if "8b" in MODEL_ID.lower() else _SERVE_SPECS_OTHER

if accelerator_type not in _serve_specs:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {MODEL_ID.lower()}."
    )
machine_type, accelerator_count = _serve_specs[accelerator_type]

common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

gpu_memory_utilization = 0.85
max_model_len = 8192  # Maximum context length.

# Guard for edits to the value above: contexts longer than 8192 are rejected.
if max_model_len > 8192:
    raise ValueError("max_model_len cannot exceed 8192")


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the vLLM container and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value passed to vLLM as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Service account used by the deployment.
        base_model_id: Value of the container's `MODEL_ID` environment
            variable; falls back to `model_id` when empty.
        machine_type: Machine type of the deployment.
        accelerator_type: Accelerator type of the deployment.
        accelerator_count: Number of accelerators; also used as vLLM's
            `--tensor-parallel-size`.
        gpu_memory_utilization: Passed as `--gpu-memory-utilization`.
        max_model_len: Passed as `--max-model-len`.
        dtype: Passed as `--dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code` when true.
        enforce_eager: Adds `--enforce-eager` when true.
        enable_lora: Adds `--enable-lora` when true.
        enable_chunked_prefill: Adds `--enable-chunked-prefill` when true.
        enable_prefix_cache: Adds `--enable-prefix-caching` when true.
        host_prefix_kv_cache_utilization_target: Passed as
            `--host-prefix-kv-cache-utilization-target` only when strictly
            between 0 and 1.
        max_loras: Passed as `--max-loras`.
        max_cpu_loras: Passed as `--max-cpu-loras`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: Passed as `--max-num-seqs`.
        model_type: Passed as `--model-type` when set.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    # Boolean switches, appended in this fixed order when enabled.
    switch_flags = (
        (enable_trust_remote_code, "--trust-remote-code"),
        (enforce_eager, "--enforce-eager"),
        (enable_lora, "--enable-lora"),
        (enable_chunked_prefill, "--enable-chunked-prefill"),
        (enable_prefix_cache, "--enable-prefix-caching"),
    )
    vllm_args.extend(flag for enabled, flag in switch_flags if enabled)

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    env_vars = {
        "MODEL_ID": base_model_id or model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is an optional notebook-level variable: it is forwarded to the
    # container only when it is defined and non-empty.
    hf_token = globals().get("HF_TOKEN")
    if hf_token:
        env_vars["HF_TOKEN"] = hf_token

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_finetuning.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy the merged model and keep the handles for the predict and clean-up cells.
vllm_model, vllm_endpoint = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3-vllm-serve"),
    model_id=merged_model_output_dir,
    publisher="meta",
    publisher_model_id="llama3",
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    use_dedicated_endpoint=use_dedicated_endpoint,
)
models["vllm_gpu"] = vllm_model
endpoints["vllm_gpu"] = vllm_endpoint

### Predict

Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

Example:

```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```
Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

- If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
-  Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["vllm_gpu"] = aiplatform.Endpoint(aip_endpoint_name)

In [None]:
# Prompt and sampling parameters for the request.
prompt = "What is a car?"
max_tokens = 200
temperature = 1.0
top_p = 1.0
top_k = 1
raw_response = False

# Overrides parameters for inferences.
request_instance = dict(
    prompt=prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    raw_response=raw_response,
)
instances = [request_instance]

# Send the request to the endpoint registered by the deploy cell.
vllm_endpoint_handle = endpoints["vllm_gpu"]
response = vllm_endpoint_handle.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

## Clean up resources
### Delete the model and endpoint
Delete the experiment models and endpoints to release the resources and avoid unnecessary ongoing charges.

In [None]:
train_job.delete()

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME