In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Mistral and Mixtral 8x7B Models

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates deploying prebuilt [Mistral](https://mistral.ai/) and Mixtral 8x7B models in Vertex AI.

### Objective

- Deploy prebuilt [Mistral models](https://huggingface.co/mistralai) with [vLLM](https://github.com/vllm-project/vllm) containers
    - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1): pretrained generative text model with 7 billion parameters
    - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): Instruction fine-tuned version of the Mistral-7B-v0.1 generative text model
- Deploy prebuilt [Mixtral 8x7B models](https://huggingface.co/mistralai) with [vLLM](https://github.com/vllm-project/vllm) containers
    - [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): pretrained Mixture of Experts (MoE) model with 8 branches
    - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1): Instruction fine-tuned version of the Mixture of Experts (MoE) model with 8 branches

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [Cloud NL API pricing](https://cloud.google.com/natural-language/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Colab only
Run the following commands for Colab and skip this section if you are using Workbench.

In [1]:
import sys

# Colab-only bootstrap: the body runs only when the `google.colab` module is
# already loaded in this interpreter.
if "google.colab" in sys.modules:
    # Upgrade the Vertex AI SDK (unpinned: installs whatever is latest).
    ! pip3 install --upgrade google-cloud-aiplatform
    from google.colab import auth as google_auth

    # Run Colab's user authentication before the kernel is restarted below.
    google_auth.authenticate_user()
    # Install gdown.
    # NOTE(review): gdown is not referenced by any visible cell in this
    # notebook — confirm this install is still needed.
    ! pip3 install gdown

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.43.0-py2.py3-none-any.whl (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-cloud-aiplatform
  Attempting uninstall: google-cloud-aiplatform
    Found existing installation: google-cloud-aiplatform 1.42.1
    Uninstalling google-cloud-aiplatform-1.42.1:
      Successfully uninstalled google-cloud-aiplatform-1.42.1
Successfully installed google-cloud-aiplatform-1.43.0




### Install dependencies

In [None]:
# Install pinned versions of transformers and accelerate.
# NOTE(review): neither package is imported by any visible cell in this
# notebook — confirm these installs are still required.
! pip3 install transformers==4.36.0
! pip3 install accelerate==0.23.0

Collecting transformers==4.36.0
  Downloading transformers-4.36.0-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting accelerate==0.23.0
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m


### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API, Compute Engine API and Cloud Natural Language API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,language.googleapis.com).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with `Vertex AI User` and `Storage Object Admin` roles for deploying fine-tuned models to a Vertex AI endpoint.

### Define environment variables

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1").

In [1]:
# Cloud project id.
PROJECT_ID = "my-project-0004-346516"  # @param {type:"string"}


# NOTE(review): VERTEX_API_PROJECT repeats the PROJECT_ID value and is not read
# by any visible cell — keep the two in sync or confirm it is still needed.
VERTEX_API_PROJECT = "my-project-0004-346516" #@param {"type": "string"}
# Region used for the Vertex AI resources; VERTEX_API_LOCATION mirrors it.
REGION = 'us-central1' #@param {"type": "string"}
VERTEX_API_LOCATION = REGION

# Bucket name without the gs:// prefix.
# NOTE(review): not read by any visible cell; it must name the same bucket as
# BUCKET_URI below.
BUCKET_NAME = 'my-project-0004-bucket' # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output with gs:// prefix.
BUCKET_URI = "gs://my-project-0004-bucket"  # @param {type:"string"}

# Make PROJECT_ID the active gcloud project. Must stay below the PROJECT_ID
# assignment because the shell command interpolates it.
! gcloud config set project $PROJECT_ID

import os

# Staging location handed to `aiplatform.init`: the "temporal" folder under
# BUCKET_URI.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# The service account looks like:
# '<account_name>@<project>.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account used for deploying the model.
# NOTE(review): the value below has the `...-compute@developer.gserviceaccount.com`
# form rather than the form shown above — confirm it carries the required roles.
SERVICE_ACCOUNT = "255766800726-compute@developer.gserviceaccount.com"  # @param {type:"string"}

Updated property [core/project].


### Initialize Vertex AI API

In [2]:
from google.cloud import aiplatform

# Configure the Vertex AI SDK with the project, region and staging bucket
# defined in the environment-variables cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### Define constants

In [None]:
# Prebuilt vLLM serving image (repository path + pinned tag) used as the
# serving container for the uploaded model.
VLLM_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20240112_0916_RC00"
)

### Define common functions

In [4]:
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to `prefix`.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS` for the moment of the call.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_model_len: int = 4096,
    use_openai_server: bool = False,
    use_chat_completions_if_openai_server: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a model with vLLM on Vertex AI.

    Creates an endpoint, uploads a model that runs the vLLM serving container
    (`VLLM_DOCKER_URI`), and deploys the model to the endpoint. The endpoint
    and the model are created before the deployment starts, so they already
    exist if the deployment step raises.

    Args:
        model_name: Display name of the model. The endpoint is named
            `<model_name>-endpoint`.
        model_id: Model ID or path to model weights. Passed to vLLM as
            `--model` and exposed to the container as the `MODEL_ID`
            environment variable.
        service_account: Service account for model uploading and deployment.
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type. T4 and V100 are served
            in float16; every other type in bfloat16.
        accelerator_count: Number of accelerators to use. Also used as the
            vLLM tensor-parallel size.
        max_model_len: Maximum model length.
        use_openai_server: Whether to use the OpenAI-format vLLM model server.
        use_chat_completions_if_openai_server: If the OpenAI model server is
            used, whether to use the chat completion API as opposed to the text
            completion API. The vLLM text completion API mimics the OpenAI text
            completion API:
            https://platform.openai.com/docs/api-reference/completions/create.
            It has two required parameters: the model ID to direct requests to
            and the prompt. The response includes a "choices" field that
            contains the generated text and a "usage" field that contains token
            counts. The vLLM chat completion API mimics the OpenAI chat
            completion API:
            https://platform.openai.com/docs/api-reference/chat/create. It has
            two required parameters: the model ID to direct requests to and
            "messages" which is a sequence of system/user/assistant/tool
            messages that can represent a multi-turn chat conversation. The
            response includes a "choices" field that contains the generated
            message from a role and a "usage" field that contains token counts.

    Returns:
        Model instance and endpoint instance.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # T4 and V100 are served in float16 (presumably because they lack bfloat16
    # support — confirm); everything else uses bfloat16.
    if accelerator_type in ("NVIDIA_TESLA_T4", "NVIDIA_TESLA_V100"):
        dtype = "float16"
    else:
        dtype = "bfloat16"

    vllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--dtype={dtype}",
        "--gpu-memory-utilization=0.9",
        f"--max-model-len={max_model_len}",
        "--disable-log-stats",
    ]
    # Label the container with the model that is actually being served. This
    # used to be hard-coded to "mistralai/Mistral-7B-v0.1", which mislabeled
    # every other model deployed through this function.
    serving_env = {
        "MODEL_ID": model_id,
    }

    # Pick the server module and the matching predict route together so the
    # two can never disagree.
    if use_openai_server:
        server_module = "vllm.entrypoints.openai.api_server"
        if use_chat_completions_if_openai_server:
            serving_container_predict_route = "/v1/chat/completions"
        else:
            serving_container_predict_route = "/v1/completions"
    else:
        server_module = "vllm.entrypoints.api_server"
        serving_container_predict_route = "/generate"

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", server_module],
        serving_container_args=vllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route=serving_container_predict_route,
        serving_container_health_route="/health",
        serving_container_environment_variables=serving_env,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint

## Deploy Prebuilt Mistral model with vLLM

This section deploys the prebuilt Mistral model with [vLLM](https://github.com/vllm-project/vllm) on a Vertex endpoint. The model deployment step will take ~15 minutes to complete.

vLLM is a highly optimized LLM serving framework which can significantly increase serving throughput. The higher QPS you have, the more benefits you get using vLLM.

Set the prebuilt model id.

In [10]:
# Model id handed to vLLM's `--model` argument by the deployment cell.
# Other ids noted for this cell: "aisingapore/sealion3b",
# "mistralai/Mistral-7B-v0.1", "mistralai/Mistral-7B-Instruct-v0.1".
prebuilt_model_id = "aisingapore/sea-lion-7b-instruct"


In [11]:
# Hardware used for the deployment. The accelerators and regions supported by
# Vertex AI prediction are listed at
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Active choice: 1 L4 to deploy Mistral 7B.
machine_type, accelerator_type, accelerator_count = "g2-standard-8", "NVIDIA_L4", 1

# Alternative: 2 V100s to deploy Mistral 7B.
# machine_type, accelerator_type, accelerator_count = "n1-standard-16", "NVIDIA_TESLA_V100", 2

# Alternative: 2 T4s to deploy Mistral 7B.
# machine_type, accelerator_type, accelerator_count = "n1-standard-16", "NVIDIA_TESLA_T4", 2

# Alternative: 1 A100 (40G) to deploy Mistral 7B.
# machine_type, accelerator_type, accelerator_count = "a2-highgpu-1g", "NVIDIA_TESLA_A100", 1

# A larger `max-model-len` can raise the requirements on
# `gpu-memory-utilization` and on the GPU configuration.
max_model_len = 4096

def deploy_model_with_config(machine_type, accelerator_type, accelerator_count):
    """Deploys `prebuilt_model_id` with vLLM on the given hardware.

    Reads the notebook-level `prebuilt_model_id`, `SERVICE_ACCOUNT` and
    `max_model_len` at call time and always uses the plain (non-OpenAI) vLLM
    server.

    Args:
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type.
        accelerator_count: Number of accelerators to use.

    Returns:
        A `(model, endpoint)` tuple as produced by `deploy_model_vllm`.
    """
    display_name = get_job_name_with_datetime(prefix="mistral-serve-vllm")
    deployed_model, deployed_endpoint = deploy_model_vllm(
        model_name=display_name,
        model_id=prebuilt_model_id,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        max_model_len=max_model_len,
        use_openai_server=False,
        use_chat_completions_if_openai_server=False,
    )
    return (deployed_model, deployed_endpoint)

# Hardware configurations to try, in order of preference:
# (machine_type, accelerator_type, accelerator_count).
DEPLOY_CANDIDATES = [
    ("g2-standard-8", "NVIDIA_L4", 1),
    ("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1),
    ("n1-standard-16", "NVIDIA_TESLA_T4", 2),
]

# Try each configuration until one deploys. The loop variables deliberately
# overwrite the notebook-level machine_type / accelerator_type /
# accelerator_count so they describe the configuration that was last attempted.
# NOTE: `deploy_model_vllm` creates the endpoint and uploads the model before it
# deploys, so a failed attempt can leave both behind; delete them manually.
for attempt, (machine_type, accelerator_type, accelerator_count) in enumerate(
    DEPLOY_CANDIDATES, start=1
):
    try:
        model, endpoint = deploy_model_with_config(
            machine_type, accelerator_type, accelerator_count
        )
    except Exception as error:
        # Catch Exception rather than using a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) stops the cell instead of
        # silently starting the next, more expensive deployment.
        print(
            f"Deployment on {accelerator_count} x {accelerator_type} "
            f"({machine_type}) failed: {type(error).__name__}: {error}"
        )
        if attempt == len(DEPLOY_CANDIDATES):
            # Nothing left to fall back to: surface the last error.
            raise
        print(
            "The error may be caused by the machine type being unavailable; "
            "trying the next configuration."
        )
    else:
        print(
            f"Model deployed on {accelerator_count} x {accelerator_type} "
            f"({machine_type})."
        )
        break


INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/255766800726/locations/us-central1/endpoints/2311677017910673408/operations/4869671063081975808
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/255766800726/locations/us-central1/endpoints/2311677017910673408
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/255766800726/locations/us-central1/endpoints/2311677017910673408')
INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/255766800726/locations/us-central1/models/8340664310866903040/operations/156654053038751744
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/255766800726/locations/us-central1/models/8340664310866903040@1
INFO:google.cloud.aiplatform.models:To use this Model in an

FailedPrecondition: 400 Model server terminated: model server container terminated: go/nodeserialize   
exit_code: 1
reason: "Error"
started_at {
  seconds: 1709780000
}
finished_at {
  seconds: 1709780011
}
. Model server logs can be found at https://console.cloud.google.com/logs/viewer?project=255766800726&resource=aiplatform.googleapis.com%252FEndpoint&advancedFilter=resource.type%3D%22aiplatform.googleapis.com%2FEndpoint%22%0Aresource.labels.endpoint_id%3D%222311677017910673408%22%0Aresource.labels.location%3D%22us-central1%22. 9: Model server terminated: model server container terminated: go/nodeserialize   
exit_code: 1
reason: "Error"
started_at {
  seconds: 1709780000
}
finished_at {
  seconds: 1709780011
}
. Model server logs can be found at https://console.cloud.google.com/logs/viewer?project=255766800726&resource=aiplatform.googleapis.com%252FEndpoint&advancedFilter=resource.type%3D%22aiplatform.googleapis.com%2FEndpoint%22%0Aresource.labels.endpoint_id%3D%222311677017910673408%22%0Aresource.labels.location%3D%22us-central1%22.

NOTE: If you see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint, the model server is likely still initializing. Please retry later.

NOTE: If you receive `InternalServerError: 500 System error` during the deployment, most likely the operation failed due to unavailability of resources. Either retry or use a different accelerator type.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

### Run sample prompt

## Deploy Prebuilt Mixtral 8x7B model with vLLM

This section deploys the prebuilt Mixtral 8x7B model with [vLLM](https://github.com/vllm-project/vllm) on a Vertex endpoint. The model deployment step will take ~40 minutes to complete.

vLLM is a highly optimized LLM serving framework which can significantly increase serving throughput. The higher QPS you have, the more benefits you get using vLLM.

Set the prebuilt model id.

In [None]:
# To query an endpoint that already exists, rebuild it from its name:
# - `endpoint_name = endpoint.name` reuses the endpoint `endpoint` created in
#   the deployment cell.
# - `endpoint_name = "1234567890123456789"` loads the existing endpoint whose
#   ID is 1234567890123456789.
# Uncomment the lines below to load an existing endpoint.

# endpoint_name = endpoint.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

# A single request: the prompt together with its sampling parameters.
sample_request = {
    "prompt": "My favourite condiment is",
    "n": 1,
    "max_tokens": 200,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}
instances = [sample_request]
response = endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

# When the model was deployed with the OpenAI-format vLLM server, send the
# request as a raw JSON body instead:
# import json
# response = endpoint.raw_predict(
#     body=json.dumps({
#         "model": prebuilt_model_id,
#         "prompt": "My favourite condiment is",
#         "n": 1,
#         "max_tokens": 200,
#         "temperature": 1.0,
#         "top_p": 1.0,
#         "top_k": 10,
#     }),
#     headers={"Content-Type": "application/json"},
# )
# print(response.json())

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.