In [30]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.1 Finetuning

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_1_finetuning.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates finetuning and deploying Llama 3.1 models with Vertex AI. All of the examples in this notebook use parameter efficient finetuning methods [PEFT (LoRA)](https://github.com/huggingface/peft) to reduce training and storage costs. LoRA (Low-Rank Adaptation) is one approach of Parameter Efficient FineTuning (PEFT), where pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during finetuning.
After finetuning, we can deploy models on Vertex with GPU.


### Objective

- Finetune Llama 3.1 models with Vertex AI Custom Training Jobs.
- Deploy finetuned Llama 3.1 models on Vertex AI Prediction.
- Send prediction requests to your finetuned Llama 3.1 models.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage


## Before you begin

### Install dependencies

In [31]:
print("Installing google-cloud-aiplatform")
! pip3 install -q google-cloud-aiplatform
print("Installing tensorflow")
!pip3 install -q tensorflow

print("Installing dependencies complete")

Installing google-cloud-aiplatform
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Installing tensorflow
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Installing dependencies complete


### Setup Google Cloud project

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

2. [Make sure that you have GPU quota for Vertex Training (finetuing) and Vertex Prediction (serving)](https://cloud.google.com/docs/quotas/view-manage). The quota name for Vertex Training is "Custom model training your-gpu-type per region" and the quota name for Vertex Prediction is "Custom model serving your-gpu-type per region" such as `Custom model training Nvidia L4 GPUs per region` and `Custom model serving Nvidia L4 GPUs per region` for L4 GPUs. [Submit a quota increase request](https://cloud.google.com/docs/quotas/view-manage#requesting_higher_quota) if additional quota is needed. At minimum, running this notebook requires 4 L4s for finetuning and 1 L4 for serving. More GPUs may be needed for larger models and different finetuning configurations. To secure GPUs for larger models, ask your customer engineer to get you allowlisted for a Shared Reservation or a Dynamic Workload Scheduler.


### Import the necessary packages

In [32]:
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

fatal: destination path 'vertex-ai-samples' already exists and is not an empty directory.


### Define environment variables

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1").

In [33]:
import importlib
import os
import uuid
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform
import tensorflow

import socket
import re

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)
models, endpoints = {}, {}

UNIQUE_PREFIX = "llama31" #socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

# Cloud project id.
PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]  # @param {type:"string"}

# The region you want to launch jobs in.
PREFIX_NUM_ONLY=int(str(re.search(r'\d+', UNIQUE_PREFIX).group()))
REGION_ALLOCATE=PREFIX_NUM_ONLY%3
if REGION_ALLOCATE == 0:
    REGION = "asia-southeast1"
elif REGION_ALLOCATE == 1:
    REGION = "us-central1"
    print("region is not ASIA Southeast")
else:
    REGION = "europe-west4"

REGION = "asia-southeast1"
# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = !(gcloud config get-value core/account)  # @param {type:"string"}
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Remove prefix gs://, e.g. foo_bucket.
BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{REGION}"
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}
print(f"Using this GCS Bucket: {BUCKET_URI}")

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={REGION}

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama3_1")

region is not ASIA Southeast
Enabling Vertex AI API and Compute Engine API.
Operation "operations/acat.p2-255766800726-d8e445bf-34f7-4336-9809-18ebbeb61708" finished successfully.
Using this GCS Bucket: gs://my-project-0004-346516-llama31-asia-southeast1
Updated property [core/project].
Creating gs://my-project-0004-346516-llama31-asia-southeast1/...


### Initialize Vertex AI API

In [34]:
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

Initializing Vertex AI API.


### The pre-built serving docker image.

In [35]:
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240721_0916_RC00"

## Access Llama 3.1 models on Vertex AI for GPU based serving
The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
Accept the model agreement to access the models:
1. Open the [Llama 3.1 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3_1) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
3. After accepting the agreement of Llama 3.1, a `gs://` URI containing Llama 3.1 pretrained and finetuned models will be shared.
4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA3_1` field below.

In [36]:
# The Llama 3.1 base model.
base_model_name = "Meta-Llama-3.1-8B"  # @param ["meta-llama/Meta-Llama-3.1-8B", "meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-70B", "meta-llama/Meta-Llama-3.1-70B-Instruct"] {isTemplate:true}
! gsutil -m cp -R gs://vertex-model-garden-public-us/llama3.1/{base_model_name} {MODEL_BUCKET}

Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/README.md [Content-Type=text/markdown]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/config.json [Content-Type=application/json]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/generation_config.json [Content-Type=application/json]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/model-00001-of-00004.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/model-00002-of-00004.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/model-00004-of-00004.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Llama-3.1-8B/model-00003-of-00004.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3.1/Meta-Ll

### Define common functions

In [37]:
def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 8192,
    max_loras: int = 1,
    max_cpu_loras: int = 16,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys trained models with vLLM into Vertex AI."""
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        "--disable-log-stats",
    ]

    env_vars = {
        "MODEL_ID": model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    print("endpoint_name:", endpoint.name)

    return model, endpoint

## Finetune with HuggingFace PEFT and deploy with vLLM on GPUs

## Set dataset

Use the Vertex AI SDK to create and run the custom training jobs.

This notebook uses [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
You can set `dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) has only one column `text`, and therefore we set `instruct_column_in_dataset` to `text` in this notebook.

### (Optional) Prepare a custom JSONL dataset for finetuning

You can prepare a JSONL file where each line is a valid JSON string as your custom training dataset. For example, here is one line from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset:
```
{"text": "### Human: Hola### Assistant: \u00a1Hola! \u00bfEn qu\u00e9 puedo ayudarte hoy?"}
```

The JSON object has a key `text`, which should match `instruct_column_in_dataset`; The value should be one training data point, i.e. a string. After you prepared your JSONL file, you can either upload it to [Hugging Face datasets](https://huggingface.co/datasets) or [Google Cloud Storage](https://cloud.google.com/storage).

- To upload a JSONL dataset to [Hugging Face datasets](https://huggingface.co/datasets), follow the instructions on [Uploading Datasets](https://huggingface.co/docs/hub/en/datasets-adding). Then, set `dataset_name` to the name of your newly created dataset on Hugging Face.

- To upload a JSONL dataset to [Google Cloud Storage](https://cloud.google.com/storage), follow the instructions on [Upload objects from a filesystem](https://cloud.google.com/storage/docs/uploading-objects). Then, set `dataset_name` to the `gs://` URI to your JSONL file. For example: `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`.

Optionally update the `instruct_column_in_dataset` field below if your JSON objects use a key other than the default `text`.

### (Optional) Format your data with custom JSON template

Sometimes, your dataset might have multiple text columns and you want to construct the training data with a template. You can prepare a JSON template in the following format:

```
{
  "description": "Template used by Llama 3.1, accepting text-bison format.",
  "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-text-models-supervised#dataset-format",
  "prompt_input": "<|start_header_id|>user<|end_header_id|>\n\n{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{output_text}<|eot_id|>",
  "instruction_separator": "<|start_header_id|>user<|end_header_id|>\n\n",
  "response_separator": "<|start_header_id|>assistant<|end_header_id|>\n\n"
}
```

As an example, the template above can be used to format the following training data (this line comes from `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`):

```
{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
```

This example template simply concatenates `input_text` with `output_text` with some special tokens in between.

To try such custom dataset, you can make the following changes:
1. Set `template` to `llama3-text-bison`
1. Set `train_dataset_name` to `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`
1. Set `train_split_name` to `train`
1. Set `eval_dataset_name` to `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_eval_sample.jsonl`
1. Set `eval_split_name` to `train` (**NOT** `test`)
1. Set `instruct_column_in_dataset` as `input_text`.

In [38]:
# Template name or gs:// URI to a custom template.
template = "openassistant-guanaco"  # @param {type:"string"}

# Hugging Face dataset name or gs:// URI to a custom JSONL dataset.
train_dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
train_split_name = "train"  # @param {type:"string"}
eval_dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
eval_split_name = "test"  # @param {type:"string"}

# Name of the dataset column containing training text input.
instruct_column_in_dataset = "text"  # @param {type:"string"}

## Finetune
Use the Vertex AI SDK to create and run the custom training jobs.

**Note**:
1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.
1. We recommend using NVIDIA_L4 for 8B models and NVIDIA_A100_80GB for 70B models.
1. If `max_steps>0`, it will precedence over `epochs`. One can set a small `max_steps` value to quickly check the pipeline.
1. With the default setting, training takes between 1.5 ~ 2 hours.

In [39]:
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00"

# The Llama 3.1 base model.
MODEL_ID = os.path.join(MODEL_BUCKET, base_model_name)
# The accelerator to use.
accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_A100_80GB"]
accelerator_count = 4

# Batch size for finetuning.
per_device_train_batch_size = 1  # @param{type:"integer"}
gradient_accumulation_steps = 8  # @param{type:"integer"}
# Maximum sequence length.
max_seq_length = 4096  # @param{type:"integer"}
# Setting a positive `max_steps` here will override `num_epochs`
max_steps = -1  # @param{type:"integer"}
num_epochs = 1.0  # @param{type:"number"}
# Precision mode for finetuning.
finetuning_precision_mode = "4bit"  # @param ["4bit", "8bit", "float16"]
# Learning rate.
learning_rate = 5e-5  # @param{type:"number"}
lr_scheduler_type = "cosine"  # @param{type:"string"}
# LoRA parameters.
lora_rank = 16  # @param{type:"integer"}
lora_alpha = 32  # @param{type:"integer"}
lora_dropout = 0.05  # @param{type:"number"}
enable_gradient_checkpointing = True
attn_implementation = "flash_attention_2"
optimizer = "paged_adamw_32bit"
warmup_ratio = "0.01"
report_to = "tensorboard"
save_steps = 10
logging_steps = save_steps

replica_count = 1

common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=True,
)

job_name = common_util.get_job_name_with_datetime("llama3_1-lora-train").replace(
    "_", "-"
)

base_output_dir = os.path.join(STAGING_BUCKET, job_name)
# Create a GCS folder to store the LORA adapter.
lora_output_dir = os.path.join(base_output_dir, "adapter")
# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_output_dir = os.path.join(base_output_dir, "merged-model")

eval_args = [
    f"--eval_dataset_path={eval_dataset_name}",
    f"--eval_column={instruct_column_in_dataset}",
    f"--eval_template={template}",
    f"--eval_split={eval_split_name}",
    f"--eval_steps={save_steps}",
    "--eval_tasks=builtin_eval",
    "--eval_metric_name=loss",
]

train_job_args = [
    "--config_file=vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml",
    "--task=instruct-lora",
    "--completion_only=True",
    f"--pretrained_model_id={MODEL_ID}",
    f"--dataset_name={train_dataset_name}",
    f"--train_split_name={train_split_name}",
    f"--instruct_column_in_dataset={instruct_column_in_dataset}",
    f"--output_dir={lora_output_dir}",
    f"--merge_base_and_lora_output_dir={merged_model_output_dir}",
    f"--per_device_train_batch_size={per_device_train_batch_size}",
    f"--gradient_accumulation_steps={gradient_accumulation_steps}",
    f"--lora_rank={lora_rank}",
    f"--lora_alpha={lora_alpha}",
    f"--lora_dropout={lora_dropout}",
    f"--max_steps={max_steps}",
    f"--max_seq_length={max_seq_length}",
    f"--learning_rate={learning_rate}",
    f"--lr_scheduler_type={lr_scheduler_type}",
    f"--precision_mode={finetuning_precision_mode}",
    f"--enable_gradient_checkpointing={enable_gradient_checkpointing}",
    f"--num_epochs={num_epochs}",
    f"--attn_implementation={attn_implementation}",
    f"--optimizer={optimizer}",
    f"--warmup_ratio={warmup_ratio}",
    f"--report_to={report_to}",
    f"--logging_output_dir={base_output_dir}",
    f"--save_steps={save_steps}",
    f"--logging_steps={logging_steps}",
    f"--template={template}",
] + eval_args

### Create TensorBoard

In [40]:
tensorboard = aiplatform.Tensorboard.create(job_name)
exp = aiplatform.TensorboardExperiment.create(
    tensorboard_experiment_id=job_name, tensorboard_name=tensorboard.name
)

# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)
# Worker pool spec.
try:
    accelerator_type = "NVIDIA_L4"
    machine_type = "g2-standard-48"
    train_job.run(
        args=train_job_args,
        environment_variables={"WANDB_DISABLED": True},
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        boot_disk_size_gb=500,
        service_account=SERVICE_ACCOUNT,
        tensorboard=tensorboard.resource_name,
        base_output_dir=base_output_dir,
    )
except Exception as e:
    print(f"Error: {e}, NVIDIA_L4 failed, trying NVIDIA_A100_80GB / g2-standard-96	")
    accelerator_type = "NVIDIA_L4" #"NVIDIA_A100_80GB"
    machine_type = "g2-standard-96" #"a2-ultragpu-4g"
    train_job.run(
        args=train_job_args,
        environment_variables={"WANDB_DISABLED": True},
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        boot_disk_size_gb=500,
        service_account=SERVICE_ACCOUNT,
        tensorboard=tensorboard.resource_name,
        base_output_dir=base_output_dir,
    )

print("LoRA adapter was saved in: ", lora_output_dir)
print("Trained and merged models were saved in: ", merged_model_output_dir)

Creating Tensorboard
Create Tensorboard backing LRO: projects/255766800726/locations/asia-southeast1/tensorboards/687784105592487936/operations/5507102449564909568
Tensorboard created. Resource name: projects/255766800726/locations/asia-southeast1/tensorboards/687784105592487936
To use this Tensorboard in another session:
tb = aiplatform.Tensorboard('projects/255766800726/locations/asia-southeast1/tensorboards/687784105592487936')
Creating TensorboardExperiment
TensorboardExperiment created. Resource name: projects/255766800726/locations/asia-southeast1/tensorboards/687784105592487936/experiments/llama3-1-lora-train-20240810-163537
To use this TensorboardExperiment in another session:
tb experiment = aiplatform.TensorboardExperiment('projects/255766800726/locations/asia-southeast1/tensorboards/687784105592487936/experiments/llama3-1-lora-train-20240810-163537')
Training Output directory:
gs://my-project-0004-346516-llama31-asia-southeast1/temporal/llama3-1-lora-train-20240810-163537 
V

RuntimeError: Custom Training has already run.

## Deploy
This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.

In [None]:
print("Deploying models in: ", merged_model_output_dir)

# Find Vertex AI prediction supported accelerators and regions in [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
if "8b" in MODEL_ID.lower():
    machine_type = "g2-standard-8"
    accelerator_type = "NVIDIA_L4"
    accelerator_count = 1
else:
    machine_type = "g2-standard-96"
    accelerator_type = "NVIDIA_L4"
    accelerator_count = 8

common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

gpu_memory_utilization = 0.85
max_model_len = 8192  # Maximum context length.

# Ensure max_model_len does not exceed the limit
if max_model_len > 8192:
    raise ValueError("max_model_len cannot exceed 8192")

models["vllm_gpu"], endpoints["vllm_gpu"] = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3_1-vllm-serve"),
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
)

## Predict

Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

Example:

```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```
Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

Loads an existing endpoint instance using the endpoint name:
- Using `endpoint_name = endpoint.name` allows us to get the
  endpoint name of the endpoint `endpoint` created in the cell
  above.
- Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
  an existing endpoint with the ID 1234567890123456789.
You may uncomment the code below to load an existing endpoint.

In [None]:
# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

In [None]:
prompt = "What is a car?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
raw_response = False  # @param {type:"boolean"}

Overrides parameters for inferences.
If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
you can reduce the maximum number of output tokens, such as set max_tokens as 20.

In [None]:
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
response = endpoints["vllm_gpu"].predict(instances=instances)

for prediction in response.predictions:
    print(prediction)


## Clean up resources
### Delete the model and endpoint
Delete the experiment models and endpoints to recycle the resources and avoid unnecessary continouous charges that may incur.

In [None]:
# train_job.delete()

#### Undeploy model and delete endpoint.

In [None]:
# for endpoint in endpoints.values():
#     endpoint.delete(force=True)

#### Delete models.

In [None]:
# for model in models.values():
#     model.delete()

# delete_bucket = False  # @param {type:"boolean"}
# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_NAME