In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3 70B

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_deployment.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
      <td style="text-align: center">
  <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_deployment.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 CPU notebook is recommended)
    </td>
</tr></tbody></table>



## Overview

This notebook demonstrates downloading prebuilt [LLaMA3 models](https://huggingface.co/meta-llama) and deploying them with [vLLM](https://github.com/vllm-project/vllm) to improve serving throughput.


### Objective

- Download and deploy prebuilt LLaMA3 models
- Deploy LLaMA3 with [vLLM](https://github.com/vllm-project/vllm) to improve serving throughput

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API, Compute Engine API and Cloud Natural Language API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,language.googleapis.com).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. (Optional)

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with the `Vertex AI User` and `Storage Object Admin` roles for deploying fine-tuned models to a Vertex AI endpoint.

### Define environment variables

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1").

Auto allocation of REGION

In [None]:
# The region you want to launch jobs in.
# Select region based on the accelerators and regions supported by Vertex AI Prediction
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

import random

# Draw an index in [0, 2] and map it onto one of the three candidate regions,
# so each run of this cell may land in a different region.
REGION_ALLOCATE = random.randint(0, 2)
REGION = ("asia-northeast3", "us-west4", "europe-west1")[REGION_ALLOCATE]

print(f"Region allocated: {REGION}")

In [None]:
import socket
import re

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

# Cloud project id.
PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Remove prefix gs://, e.g. foo_bucket.
BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{REGION}"

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = !(gcloud config get-value core/account)  # @param {type:"string"}
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]  # @param {type:"string"}

BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={REGION}

import os

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

In [None]:
# Destination prefix inside this project's bucket for the LLaMA3 artifacts.
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama3")
# Recursively copy the public Model Garden LLaMA3 folder to that prefix
# (-m runs the copy in parallel).
!gsutil -m cp -R gs://vertex-model-garden-public-us/llama3 {MODEL_BUCKET}

### Initialize Vertex AI API

In [None]:
from google.cloud import aiplatform

# Bind the Vertex AI SDK to the project, region and staging location set up
# in the cells above, so later SDK calls use them by default.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### Define common functions

In [None]:
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local time to `prefix`.

    The appended suffix has the form `_YYYYMMDD_HHMMSS`, so names generated
    for training or deployment jobs in Vertex AI differ from one second to
    the next.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        `prefix` followed by the timestamp suffix.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    use_openai_server: bool = False,
    use_chat_completions_if_openai_server: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by vLLM and deploys it to a new Vertex AI endpoint.

    A dedicated endpoint is created first; the model is then registered with a
    vLLM serving container and deployed onto that endpoint.

    Args:
        model_name: Display name of the model. The endpoint is named
            `<model_name>-endpoint`.
        model_id: Model ID or path to model weights. IDs containing "llama"
            select a different serving image tag than all other IDs.
        service_account: Service account passed to the deployment.
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type. `NVIDIA_TESLA_T4` and
            `NVIDIA_TESLA_V100` are served with dtype float16; every other
            type with bfloat16.
        accelerator_count: Number of accelerators to use. Also used as the
            vLLM tensor-parallel size.
        gpu_memory_utilization: Fraction of GPU memory to be used for the
            model executor.
        max_model_len: Maximum model length.
        use_openai_server: Whether to use the OpenAI-format vLLM model server
            instead of the plain vLLM API server.
        use_chat_completions_if_openai_server: Only relevant when the OpenAI
            model server is used. If true, predictions are routed to the chat
            completion API, otherwise to the text completion API.
            The text completion API mimics
            https://platform.openai.com/docs/api-reference/completions/create:
            it requires the model ID to direct requests to and the prompt, and
            its response has a "choices" field with the generated text and a
            "usage" field with token counts.
            The chat completion API mimics
            https://platform.openai.com/docs/api-reference/chat/create: it
            requires the model ID and "messages", a sequence of
            system/user/assistant/tool messages that can represent a
            multi-turn chat conversation, and its response has a "choices"
            field with the generated message from a role and a "usage" field
            with token counts.

    Returns:
        Model instance and endpoint instance.
    """
    # The endpoint is created before anything else is computed or uploaded.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # LLaMA model IDs are served from a different build of the vLLM image.
    serving_image_uri = (
        "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240418_0936_RC01"
        if "llama" in model_id
        else "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240313_0916_RC00"
    )

    half_precision_only = ("NVIDIA_TESLA_T4", "NVIDIA_TESLA_V100")
    dtype = "float16" if accelerator_type in half_precision_only else "bfloat16"

    # Command-line flags handed to the vLLM server process.
    vllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--dtype={dtype}",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        "--disable-log-stats",
    ]

    # Pick the server entrypoint module together with the matching predict route.
    if not use_openai_server:
        server_module = "vllm.entrypoints.api_server"
        predict_route = "/generate"
    elif use_chat_completions_if_openai_server:
        server_module = "vllm.entrypoints.openai.api_server"
        predict_route = "/v1/chat/completions"
    else:
        server_module = "vllm.entrypoints.openai.api_server"
        predict_route = "/v1/completions"

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=serving_image_uri,
        serving_container_command=["python", "-m", server_module],
        serving_container_args=vllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route=predict_route,
        serving_container_health_route="/health",
        serving_container_environment_variables={"MODEL_ID": model_id},
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint

## Deploy Prebuilt LLAMA 3 model with vLLM
This section uploads a prebuilt LLaMA3 model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

NVIDIA_L4 GPUs are used for demonstration. The serving efficiency of L4 GPUs is inferior to that of A100 GPUs, but L4 GPUs are nevertheless good serving solutions if you do not have A100 quota.

In [None]:
# LLaMA3 variant to serve, exposed as a Colab form field.
base_model_name = "llama3-70b-hf"  # @param ["llama3-8b-hf", "llama3-8b-chat-hf", "llama3-70b-hf", "llama3-70b-chat-hf"] {isTemplate:true}
# Path of that variant under MODEL_BUCKET; used as the model ID for deployment.
model_id = os.path.join(MODEL_BUCKET, base_model_name)

Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

In [None]:
# Serving settings for the deployment performed in this cell.
accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_A100"]
# Forwarded to the deployment as the GPU memory utilization setting.
gpu_memory_utilization = 0.85
max_model_len = 8192  # Maximum context length.

def deploy_model_with_config(machine_type: str, accelerator_type: str, accelerator_count: int):
    """Deploys the selected model on the given hardware configuration.

    Thin wrapper around `deploy_model_vllm` that fills in the notebook-level
    settings (`model_id`, `SERVICE_ACCOUNT`, `gpu_memory_utilization`,
    `max_model_len`) and a timestamped "llama3-serve" display name.

    Args:
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type.
        accelerator_count: Number of accelerators to use.

    Returns:
        The `(model, endpoint)` pair produced by `deploy_model_vllm`.
    """
    deployment_settings = dict(
        model_name=get_job_name_with_datetime(prefix="llama3-serve"),
        model_id=model_id,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
    )
    deployed_model, deployed_endpoint = deploy_model_vllm(**deployment_settings)
    return deployed_model, deployed_endpoint

# Hardware configurations to try for each model size, in default priority
# order. The first size key contained in `base_model_name` is used.
GPU_CONFIGS_BY_MODEL_SIZE = {
    "8b": [
        # L4 serving is more cost efficient than V100 serving.
        {"machine_type": "g2-standard-8", "accelerator_type": "NVIDIA_L4", "accelerator_count": 1},
        {"machine_type": "a2-highgpu-1g", "accelerator_type": "NVIDIA_TESLA_A100", "accelerator_count": 1},
    ],
    # If you do not have access to 4 A100 (40G) GPUs, you may serve LLaMA3 70B
    # models with 8 L4 (24G) GPUs.
    # Note that with the default timeout threshold of Vertex endpoints, you should
    # set a `max_tokens` configuration of around 1,000 tokens or fewer. If you need
    # longer generated sequences, please file a request with Vertex to allowlist
    # your project for a longer timeout threshold with Vertex endpoints.
    "70b": [
        {"machine_type": "g2-standard-96", "accelerator_type": "NVIDIA_L4", "accelerator_count": 8},
        {"machine_type": "a2-highgpu-4g", "accelerator_type": "NVIDIA_TESLA_A100", "accelerator_count": 4},
    ],
}

candidate_configs = next(
    (
        configs
        for size_tag, configs in GPU_CONFIGS_BY_MODEL_SIZE.items()
        if size_tag in base_model_name
    ),
    [],
)
# Try the accelerator selected in `accelerator_type` first. The sort is stable,
# so with the default selection the order above is kept unchanged.
candidate_configs = sorted(
    candidate_configs,
    key=lambda config: config["accelerator_type"] != accelerator_type,
)

deployed = False
deployment_errors = []
for config in candidate_configs:
    try:
        llama_model, llama_endpoint = deploy_model_with_config(**config)
    except Exception as e:
        # A failed attempt is reported and the next configuration is tried.
        print(f"Error: {e} ")
        deployment_errors.append(e)
        continue
    deployed = True
    break

if not deployed:
    # Report the accelerators that were actually attempted; when the model name
    # matched no known size there were none, so fall back to the selected one.
    attempted = ", ".join(config["accelerator_type"] for config in candidate_configs)
    failure_message = (
        f"Recommended GPU setting not found for: {attempted or accelerator_type} and {base_model_name}."
    )
    print(failure_message)
    # Stop here instead of letting later cells fail on an undefined (or stale)
    # `llama_endpoint`.
    raise RuntimeError(failure_message) from (
        deployment_errors[-1] if deployment_errors else None
    )

### Predict
Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).
Example:
```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```
Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

Set the prebuilt model id.

NOTE: If you see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint, the model server is likely still initializing. Please retry later.

NOTE: If you receive `InternalServerError: 500 System error` during the deployment, most likely the operation failed due to unavailability of resources. Either retry or use a different accelerator type.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

In [None]:
prompt = "What is an apple?"  # @param {type: "string"}: Text prompt that the model will complete
max_tokens = 50  # @param {type:"integer"}:  Maximum number of tokens to generate
temperature = 1.0  # @param {type:"number"}: Randomness of the text
top_p = 1.0  # @param {type:"number"}: probability distribution
top_k = 1  # @param {type:"integer"}: number of continuations to consider at each step when building the sequence
raw_response = False  # @param {type:"boolean"}: whether to use raw response

# Overrides parameters for inference.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the maximum number of output tokens, e.g. set max_tokens to 20.
# A single request instance carrying the prompt and the sampling parameters above.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
# Send the request to the endpoint deployed earlier in the notebook.
response = llama_endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

## Cleaning up
Delete the experiment models and endpoints to recycle the resources and avoid unnecessary continuous charges that may be incurred.

### Undeploy models and Delete endpoints

Uncomment the lines in the cell below to undeploy the model and delete the endpoint, the model, and the bucket.

After running the cell below, do check in the [console](https://console.cloud.google.com/vertex-ai/online-prediction/endpoints) that all endpoints have been deleted


In [None]:
# # Undeploy model and delete endpoint.
# llama_endpoint.delete(force=True)

# # Delete models
# llama_model.delete()

# # Delete buckets
# delete_bucket = True  # @param {type:"boolean"}
# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_URI