In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - LLaMA2 (PEFT Finetuning)

Adapted from: [Llama 2 PEFT Finetuning with Text Data on GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_peft_finetuning.ipynb)

Modified by: [Wan Qi Ang](https://github.com/angwanqi) for 2024 EDB x Google Cloud - Cloud AI Take Off Program

Last updated: 11 Nov 2024

## Overview

This notebook demonstrates downloading [LLaMA2 models](https://huggingface.co/meta-llama), finetuning with parameter efficient finetuning libraries ([PEFT](https://github.com/huggingface/peft)), and deploying the finetuned model on Vertex AI.

### Objective

- Download prebuilt LLaMA2 models.
- Finetune and deploy LLaMA2 models with Vertex AI SDK.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Installation
Run the cell below if this is your first time running the notebook. Else, feel free to skip the cell below as the libraries would have already been installed.

In [None]:
# Install/upgrade the Vertex AI SDK, which provides the `google.cloud.aiplatform` module imported below.
%pip install --upgrade --user --quiet google-cloud-aiplatform
# Install/upgrade the Hugging Face Hub client.
# NOTE(review): presumably required so pandas can read the `hf://` dataset path used later — confirm.
%pip install --quiet --upgrade huggingface_hub

### Restart current runtime
To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.
<div class="alert alert-block alert-success"> 
<b>NOTE:</b> Only restart the current runtime if you installed libraries. If you did not install new libraries, you do not need to restart the kernel.
</div>

In [None]:
# Restart kernel after installs so that your environment can access the new packages
import IPython
import time  # NOTE(review): not referenced in this cell — confirm whether it can be dropped.

# Grab the running IPython application and ask its kernel to shut down.
# The `True` argument is presumably the "restart" flag, matching the intent stated above — confirm.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

### Import necessary libraries

In [None]:
import os
from datetime import datetime
from typing import Tuple
import pandas as pd

from google.cloud import aiplatform

### Set your project ID

In [None]:
# Get the default cloud project id.
# The `!` shell capture yields the command output as a list of lines.
PROJECT_ID= !(gcloud config get-value core/project)
# Keep only the first output line as the project id string.
PROJECT_ID = PROJECT_ID[0]

### Set variables

In [None]:
# Set the default region for launching jobs.
REGION = "asia-southeast1"

# Echo the resolved settings so a wrong project or region is noticed early.
# (Plain strings: the labels contain no placeholders, so no f-prefix is needed.)
print("Project ID:", PROJECT_ID)
print("Project Region:", REGION)

### Set (or create) the Google Cloud Storage bucket

<div class="alert alert-block alert-info"> 
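<b>INPUT REQUIRED:</b> Replace <code>&lt;YOUR_NAME&gt;</code> with your name so that you'll be able to identify your Google Cloud Storage bucket later on. Use only lowercase letters, numbers, and hyphens, since the value becomes part of the bucket name. <b>Example:</b> <code>PREFIX = "john"</code>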
</div>

In [None]:
# Update the bucket prefix with your name.
# The placeholder must be replaced before running the cells below.
PREFIX = "<YOUR_NAME>"

# Concatenate to get the full bucket name
BUCKET_NAME = PREFIX + "-llama2-text-peft"
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Cloud Storage URIs always use "/" as the separator, so build them with
# f-strings rather than os.path.join (which follows the local OS convention).
STAGING_BUCKET = f"{BUCKET_URI}/temporal"
EXPERIMENT_BUCKET = f"{BUCKET_URI}/peft"
MODEL_BUCKET = f"{BUCKET_URI}/llama2"


print("ROOT_BUCKET_URI:", BUCKET_URI)
print("STAGING_BUCKET_URI:", STAGING_BUCKET)
print("EXPERIMENT_BUCKET_URI:", EXPERIMENT_BUCKET)
print("MODEL_BUCKET_URI:", MODEL_BUCKET)

In [None]:
# Create Cloud Storage Bucket
# Creates the bucket named by BUCKET_URI in the region held by REGION.
# NOTE(review): re-running this cell after the bucket exists presumably reports an error — confirm it is safe to ignore.
!gcloud storage buckets create $BUCKET_URI --location=$REGION

### Setting up the Compute Engine Service Account

#### Retrieve the default Compute Engine Service Account

In [None]:
# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

#### Assign Cloud Storage admin IAM role to the Service Account

In [None]:
# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Point the Vertex AI SDK at this project, region and staging bucket.
print("Initializing Vertex AI API.")
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

## Finetuning Llama 2
- The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
- Accept the model agreement to access the models:
- Open the [LLaMA2 model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
- Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
- A Cloud Storage bucket (starting with `gs://`) containing LLaMA 2 pretrained and finetuned models will be shared under the “Documentation” section and its “Get started” subsection. 

### Download Llama 2 Model Artifacts to your project

In [None]:
# Copy Llama 2 model artifacts to your bucket
VERTEX_AI_MODEL_GARDEN_LLAMA2 = "gs://vertex-model-garden-public-us-central1/llama2"

# Recursively copy everything under the source prefix into MODEL_BUCKET.
# NOTE(review): the wildcard copies every object under the prefix, presumably all model variants — confirm this is intended.
! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA2/* $MODEL_BUCKET

### Utility Functions

In [None]:
# Utility Functions
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to `prefix`.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS`.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return prefix + "_" + timestamp

### Setting up the training dataset
This notebook uses [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
You can set `dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) has only one column `text`, and therefore we set `instruct_column_in_dataset` to `text` in this notebook.

In [None]:
# Training data source: a Hugging Face dataset name, or a gs:// URI pointing to a custom JSONL dataset.
dataset_name = "timdettmers/openassistant-guanaco"

# Column of the dataset that holds the training text.
instruct_column_in_dataset = "text"

# Optional prompt template: a template name or a gs:// URI to a custom template (empty means none is set).
template = ""

In [None]:
# Read the training split of the example dataset so we can inspect it.
splits = {
    "train": "openassistant_best_replies_train.jsonl",
    "test": "openassistant_best_replies_eval.jsonl",
}
train_path = "hf://datasets/timdettmers/openassistant-guanaco/" + splits["train"]
df = pd.read_json(train_path, lines=True)

# Show the first 2 rows
df.head(2)

In [None]:
# Display the text of the row labelled 0 (the first row of the freshly loaded frame)
df.loc[0, "text"]

### Finetune with PEFT
This section demonstrates how to finetune the LLaMA 2 models with PEFT LoRA. By default, the model will be finetuned for 500 steps on a batch size of 1 to save GPU resources.
- Finetuning `llama2-7b` models is expected to take around 30 minutes.
- To customize finetuning settings and parameters, click "Show code" to see more details.

Here is the full list of models that you can tune:\
["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]

#### Set the base model ID that you would like to tune

In [None]:
# Set the base model id.
base_model_id = "llama2-7b-hf"
# Location of the base model inside your bucket. Cloud Storage URIs always use
# "/" as the separator, so build the path with an f-string instead of os.path.join.
model_id = f"{MODEL_BUCKET}/{base_model_id}"

#### Set the accelerator that you would like to use

In [None]:
# Set the accelerator type.
accelerator_type = "NVIDIA_A100_80GB"

# Recommended (machine_type, accelerator_count) for finetuning, keyed by model
# size and then by accelerator type.
TRAIN_MACHINE_SPECS = {
    "7b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),
        "NVIDIA_TESLA_V100": ("n1-standard-16", 2),
        "NVIDIA_L4": ("g2-standard-48", 4),
        "NVIDIA_A100_80GB": ("a2-ultragpu-1g", 1),
    },
    "13b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),
        "NVIDIA_TESLA_V100": ("n1-standard-32", 4),
        "NVIDIA_L4": ("g2-standard-24", 2),
        "NVIDIA_A100_80GB": ("a2-ultragpu-1g", 1),
    },
    "70b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-4g", 4),
        "NVIDIA_L4": ("g2-standard-96", 8),
    },
}

# Match the size against base_model_id rather than the full gs:// path: the
# path contains the user-chosen bucket prefix, which could itself contain
# "7b" or "13b" and select the wrong row.
model_size = next(
    (size for size in TRAIN_MACHINE_SPECS if size in base_model_id), None
)
machine_type, accelerator_count = TRAIN_MACHINE_SPECS.get(model_size, {}).get(
    accelerator_type, (None, None)
)

if machine_type is None:
    raise ValueError(
        f"Recommended machine settings not found for model: {base_model_id} with accelerator: {accelerator_type}. To use another accelerator, please edit this code block to set an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` in worker_pool_specs."
    )

#### Set up the training parameters

In [None]:
# Names for the training run and the merge step, each with its own output folder
# under the experiment bucket.
job_name = get_job_name_with_datetime(PREFIX + "-llama2-train")
output_dir = os.path.join(EXPERIMENT_BUCKET, job_name)
merge_job_name = get_job_name_with_datetime(PREFIX + "-llama2-merge")
merged_model_output_dir = os.path.join(EXPERIMENT_BUCKET, merge_job_name)

# Numeric precision used for finetuning.
finetune_precision_mode = "float16"

# The pre-built training docker images.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:latest"

replica_count = 1

# Training length and batch size: 500 steps with one example per device.
max_steps = 500
per_device_train_batch_size = 1

# LoRA parameters.
lora_rank = 16
lora_alpha = 32
lora_dropout = 0.05

# Flags handed to the training container; each entry becomes `--key=value`.
flags = {
    "learning_rate": 2e-4,
    "precision_mode": finetune_precision_mode,
    "task": "instruct-lora",
    "per_device_train_batch_size": per_device_train_batch_size,
    "dataset_name": dataset_name,
    "instruct_column_in_dataset": instruct_column_in_dataset,
    "template": template,
    "pretrained_model_id": model_id,
    "output_dir": output_dir,
    "merge_base_and_lora_output_dir": merged_model_output_dir,
    "warmup_steps": 10,
    "max_steps": max_steps,
    "lora_rank": lora_rank,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
}
container_args = [f"--{key}={value}" for key, value in flags.items()]

# Single worker pool running the training image on the selected machine.
worker_pool_spec = {
    "machine_spec": {
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
    },
    "replica_count": replica_count,
    "container_spec": {
        "image_uri": TRAIN_DOCKER_URI,
        "args": container_args,
    },
}

train_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=[worker_pool_spec],
    staging_bucket=STAGING_BUCKET,
)

#### Start the training job

<div class="alert alert-block alert-warning">
<b>⚠️ It will take ~45mins for the model tuning job to complete on the provided dataset and set configurations/hyperparameters. ⚠️</b>
</div>

In [None]:
# Launch the training job defined above.
train_job.run()

# Report where the job writes its outputs.
print("The finetuned models of different trials can be found at: ", output_dir)
print(
    "The finetuned model merged with the base model can be found at: ",
    merged_model_output_dir,
)

## Deploying your finetuned model
This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.

### Utility Functions

In [None]:
# Utility Functions
def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_model_len: int = 4096,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys trained models with vLLM into Vertex AI.

    Creates a new endpoint, uploads the model with the vLLM serving container
    (image taken from the notebook-level `VLLM_DOCKER_URI`), and deploys the
    model to that endpoint.

    Args:
        model_name: Display name for the uploaded model; the endpoint is
            named `<model_name>-endpoint`.
        model_id: Location of the model artifacts; passed both as the
            `MODEL_ID` container environment variable and as `artifact_uri`.
        service_account: Service account the deployment runs as.
        machine_type: Machine type for the deployment.
        accelerator_type: Accelerator type attached to the machine.
        accelerator_count: Number of accelerators; also used as the vLLM
            tensor-parallel size.
        max_model_len: Maximum sequence length passed to vLLM.

    Returns:
        The uploaded `aiplatform.Model` and the `aiplatform.Endpoint` it was
        deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # vLLM refuses to start when the batched-token budget is smaller than the
    # maximum sequence length, so grow the budget with max_model_len while
    # keeping the previous 4096 floor.
    max_num_batched_tokens = max(4096, max_model_len)

    vllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.8",
        f"--max-model-len={max_model_len}",
        f"--max-num-batched-tokens={max_num_batched_tokens}",
        "--disable-log-stats",
    ]

    env_vars = {"MODEL_ID": model_id, "DEPLOY_SOURCE": "notebook"}
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=vllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        artifact_uri=model_id,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    print("endpoint_name:", endpoint.name)

    # Use the endpoint's own resource name so the hint stays correct even if
    # the notebook-level PROJECT_ID / REGION variables have changed since.
    print("To load this existing endpoint from a different session:")
    print("from google.cloud import aiplatform")
    print(f'endpoint = aiplatform.Endpoint("{endpoint.resource_name}")')
    return model, endpoint

### Deploy the model

In [None]:
# Re-initialize the Vertex AI SDK for the region you want to deploy in.
# Replace the placeholder with one of: us-central1, asia-southeast1, europe-west4
REGION = "<SET_REGION>"

print("Initializing Vertex AI API.")
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
)

In [None]:
# The pre-built vLLM serving docker image.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240326_0916_RC00"

print("Deploying models in: ", merged_model_output_dir)

# The max_model_len must not exceed the model's context length.
# A larger max_model_len will require more GPU memory.
max_model_len = 2048
accelerator_type = "NVIDIA_A100_80GB"
# accelerator_type = "NVIDIA_L4"

# Recommended (machine_type, accelerator_count) for serving, keyed by model
# size and then by accelerator type.
SERVE_MACHINE_SPECS = {
    "7b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),
        "NVIDIA_TESLA_V100": ("n1-standard-8", 1),
        "NVIDIA_L4": ("g2-standard-48", 4),
        "NVIDIA_A100_80GB": ("a2-ultragpu-1g", 1),
    },
    "13b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),
        "NVIDIA_TESLA_V100": ("n1-standard-16", 2),
        "NVIDIA_L4": ("g2-standard-24", 2),
        # Added so the default accelerator also works for 13b models, matching
        # the combinations accepted by the training cell.
        "NVIDIA_A100_80GB": ("a2-ultragpu-1g", 1),
    },
    "70b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-4g", 4),
        "NVIDIA_L4": ("g2-standard-96", 8),
    },
}

# Match the size against base_model_id rather than the full gs:// path: the
# path contains the user-chosen bucket prefix, which could itself contain
# "7b" or "13b" and select the wrong row.
model_size = next(
    (size for size in SERVE_MACHINE_SPECS if size in base_model_id), None
)
machine_type, accelerator_count = SERVE_MACHINE_SPECS.get(model_size, {}).get(
    accelerator_type, (None, None)
)

if machine_type is None:
    raise ValueError(
        f"Recommended machine settings not found for model: {base_model_id} with accelerator: {accelerator_type}. To use another accelerator, please edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function."
    )

# deploy_model_vllm prints the endpoint name itself, so it is not repeated here.
model, endpoint = deploy_model_vllm(
    model_name=get_job_name_with_datetime(prefix="llama-vllm-serve"),
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
)

## Testing the endpoint with a prediction
Once deployment succeeds, you can send requests to the endpoint with text prompts.

Here we use an example from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) to show the finetuning outcome:

<b>Human:</b> How would the Future of AI in 10 Years look?\
<b>Assistant:</b> Predicting the future is always a challenging task, but here are some possible ways that AI could evolve over the next 10 years:
- <b>Continued advancements in deep learning:</b> Deep learning has been one of the main drivers of recent AI breakthroughs, and we can expect continued advancements in this area. This may include improvements to existing algorithms, as well as the development of new architectures that are better suited to specific types of data and tasks.
- <b>Increased use of AI in healthcare:</b> AI has the potential to revolutionize healthcare, by improving the accuracy of diagnoses, developing new treatments, and personalizing patient care. We can expect to see continued investment in this area, with more healthcare providers and researchers using AI to improve patient outcomes.
- <b>Greater automation in the workplace:</b> Automation is already transforming many industries, and AI is likely to play an increasingly important role in this process. We can expect to see more jobs being automated, as well as the development of new types of jobs that require a combination of human and machine skills.
- <b>More natural and intuitive interactions with technology:</b> As AI becomes more advanced, we can expect to see more natural and intuitive ways of interacting with technology. This may include voice and gesture recognition, as well as more sophisticated chatbots and virtual assistants.
- <b>Increased focus on ethical considerations:</b> As AI becomes more powerful, there will be a growing need to consider its ethical implications. This may include issues such as bias in AI algorithms, the impact of automation on employment, and the use of AI in surveillance and policing. 

Overall, the future of AI in 10 years is likely to be shaped by a combination of technological advancements, societal changes, and ethical considerations. While there are many exciting possibilities for AI in the future, it will be important to carefully consider its potential impact on society and to work towards ensuring that its benefits are shared fairly and equitably.

In [None]:
# You may uncomment the code below to load an existing endpoint.

# # Find a previously deployed Vertex AI Prediction Endpoint and set it.
# # The loop variable is named `candidate` so that it does not overwrite `endpoint`.
# endpoint = None
# check_regions = ["us-central1", "asia-southeast1", "europe-west4"]

# for region in check_regions:
#     all_endpoints = aiplatform.Endpoint.list(location=region)
#     for candidate in all_endpoints:
#         full_endpoint = f"projects/{PROJECT_ID}/locations/{region}/endpoints/{candidate.name}"

#         if "llama-vllm-serve" in candidate.display_name:
#             endpoint = aiplatform.Endpoint(full_endpoint)

In [None]:
# Question to send, plus the sampling settings for the request.
prompt = "How would the Future of AI in 10 Years look?"
max_tokens = 128
temperature = 1.0
top_p = 0.9
top_k = 1

# Overrides max_tokens and top_k parameters during inference.
# If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the max length, such as set max_tokens as 20.
request = {
    "prompt": f"### Human: {prompt}### Assistant: ",
    "max_tokens": max_tokens,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
}
instances = [request]
response = endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

## Clean up resources
Delete the experiment models and endpoints to release the resources and avoid unnecessary ongoing charges.

In [None]:
# if train_job._gca_resource.name:
#     # Training job is submitted.
#     train_job.delete()

# # Undeploy model and delete endpoint.
# endpoint.delete(force=True)

# # Delete model.
# model.delete()

# # Delete Cloud Storage objects that were created.
# delete_bucket = False  # @param {type:"boolean"}
# if delete_bucket:
#     ! gsutil -m rm -r $BUCKET_URI