In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - LLAMA 3, Mistral and Mixtral 8x7B Models

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_mistral.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates deploying prebuilt LLaMA 3, [Mistral](https://mistral.ai/) and Mixtral 8x7B models in Vertex AI.

### Objective

- Deploy prebuilt [Mistral models](https://huggingface.co/mistralai) with [vLLM](https://github.com/vllm-project/vllm) containers
    - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1): pretrained generative text model with 7 billion parameters
    - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): Instruction fine-tuned version of the Mistral-7B-v0.1 generative text model
- Deploy prebuilt [Mixtral 8x7B model](https://huggingface.co/mistralai) with [vLLM](https://github.com/vllm-project/vllm) containers
    - [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): pretrained Mixture of Experts (MoE) model with 8 branches
    - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1): Instruction fine-tuned version of the Mixture of Experts (MoE) model with 8 branches

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [Cloud NL API pricing](https://cloud.google.com/natural-language/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Install dependencies

In [2]:
# Vertex AI SDK, upgraded to the latest available release.
! pip3 install -q --upgrade google-cloud-aiplatform
# Pinned transformers release; only referenced by the (commented-out)
# local-inference cell near the end of this notebook.
! pip3 install -q transformers==4.36.0
! pip3 install -q accelerate==0.23.0
# NOTE(review): gdown is not referenced anywhere else in this notebook —
# confirm it is still needed.
! pip3 install -q gdown

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API, Compute Engine API and Cloud Natural Language API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,language.googleapis.com).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with `Vertex AI User` and `Storage Object Admin` roles for deploying fine tuned model to Vertex AI endpoint.

### Define environment variables

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1").

In [3]:
import socket
import re

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

# Cloud project id.
PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]  # @param {type:"string"}

# The region you want to launch jobs in.
# Select region based on the accelerators and regions supported by Vertex AI Prediction
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
REGION = "asia-southeast1"  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Remove prefix gs://, e.g. foo_bucket.
BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{REGION}"

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = !(gcloud config get-value core/account)  # @param {type:"string"}
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]  # @param {type:"string"}

BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={REGION}

import os

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

Updated property [core/project].
Creating gs://mythical-lens-406709-mistraltest1-asia-southeast1/...


In [4]:
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama3")
!gsutil -m cp -R gs://vertex-model-garden-public-us/llama3 {MODEL_BUCKET}

Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/LICENSE [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/README.md [Content-Type=text/markdown]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/config.json [Content-Type=application/json]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/USE_POLICY.md [Content-Type=text/markdown]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/generation_config.json [Content-Type=application/json]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/model-00005-of-00030.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/model-00002-of-00030.safetensors [Content-Type=application/octet-stream]...
Copying gs://vertex-model-garden-public-us/llama3/llama3-70b-chat-hf/model-00011-of-00030.safetensors [Content-Type=appli

### Initialize Vertex AI API

In [10]:
from google.cloud import aiplatform

# Point the Vertex AI SDK at the project, region and staging bucket defined in
# the environment-variable cell above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

### Define common functions

In [11]:
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to a prefix.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        The prefix followed by a suffix of the form ``_YYYYMMDD_HHMMSS``.
    """
    timestamp_suffix = f"{datetime.now():_%Y%m%d_%H%M%S}"
    return "".join((prefix, timestamp_suffix))


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    use_openai_server: bool = False,
    use_chat_completions_if_openai_server: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a LLaMA 3, Mistral or Mixtral model with vLLM on Vertex AI.

    Creates an endpoint, uploads a model backed by a vLLM serving container
    and deploys that model to the endpoint. If the upload or the deployment
    fails, the endpoint (and the model, if it was uploaded) created by this
    call are deleted again on a best-effort basis before the error is
    re-raised, so that callers retrying with another machine shape do not
    accumulate orphaned resources.

    Args:
        model_name: Display name of the model.
        model_id: Model ID or path to model weights. When it contains
            "llama" (case-insensitive) a different vLLM container image is
            used than for other models.
        service_account: Service account for model uploading and deployment.
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type.
        accelerator_count: Number of accelerators to use. Also used as the
            vLLM tensor-parallel size.
        gpu_memory_utilization: Fraction of GPU memory to be used for the
            model executor.
        max_model_len: Maximum model length.
        use_openai_server: Whether to use the OpenAI-format vLLM model server.
        use_chat_completions_if_openai_server: If the OpenAI model server is
            used, whether to use the chat completion API as opposed to the text
            completion API. The vLLM text completion API mimics the OpenAI text
            completion API:
            https://platform.openai.com/docs/api-reference/completions/create.
            It has two required parameters: the model ID to direct requests to
            and the prompt. The response includes a "choices" field that
            contains the generated text and a "usage" field that contains token
            counts. The vLLM chat completion API mimics the OpenAI chat
            completion API:
            https://platform.openai.com/docs/api-reference/chat/create. It has
            two required parameters: the model ID to direct requests to and
            "messages" which is a sequence of system/user/assistant/tool
            messages that can represent a multi-turn chat conversation. The
            response includes a "choices" field that contains the generated
            message from a role and a "usage" field that contains token counts.

    Returns:
        Model instance and endpoint instance.

    Raises:
        Exception: Any error raised while uploading or deploying the model is
            re-raised unchanged after the cleanup attempt.
    """
    # LLaMA models are served from a different container build than the
    # Mistral / Mixtral models.
    if "llama" in model_id.lower():
        vllm_docker_uri = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240418_0936_RC01"
    else:
        vllm_docker_uri = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240313_0916_RC00"

    # T4 and V100 deployments use float16; every other accelerator uses
    # bfloat16.
    dtype = "bfloat16"
    if accelerator_type in ["NVIDIA_TESLA_T4", "NVIDIA_TESLA_V100"]:
        dtype = "float16"

    vllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--dtype={dtype}",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        "--disable-log-stats",
    ]

    serving_env = {
        "MODEL_ID": model_id,
    }

    # The predict route and the server entrypoint both depend on which vLLM
    # server flavour was requested.
    if use_openai_server:
        if use_chat_completions_if_openai_server:
            serving_container_predict_route = "/v1/chat/completions"
        else:
            serving_container_predict_route = "/v1/completions"
        server_module = "vllm.entrypoints.openai.api_server"
    else:
        serving_container_predict_route = "/generate"
        server_module = "vllm.entrypoints.api_server"

    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    model = None
    try:
        model = aiplatform.Model.upload(
            display_name=model_name,
            serving_container_image_uri=vllm_docker_uri,
            serving_container_command=["python", "-m", server_module],
            serving_container_args=vllm_args,
            serving_container_ports=[7080],
            serving_container_predict_route=serving_container_predict_route,
            serving_container_health_route="/health",
            serving_container_environment_variables=serving_env,
        )

        model.deploy(
            endpoint=endpoint,
            machine_type=machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=accelerator_count,
            deploy_request_timeout=1800,
            service_account=service_account,
        )
    except Exception:
        # Best-effort cleanup of what this call created. The endpoint goes
        # first (force=True also undeploys anything on it) so that the model
        # is no longer deployed when it is deleted. Cleanup failures are only
        # reported; the original error is what the caller needs to see.
        for resource, delete_kwargs in ((endpoint, {"force": True}), (model, {})):
            if resource is None:
                continue
            try:
                resource.delete(**delete_kwargs)
            except Exception as cleanup_error:
                print(
                    f"Warning: failed to clean up {resource.resource_name}: {cleanup_error}"
                )
        raise
    return model, endpoint

## Deploy Prebuilt LLAMA 3 model with vLLM
This section uploads a prebuilt LLaMA 3 model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

NVIDIA_L4 GPUs are used for demonstration. The serving efficiency of L4 GPUs is inferior to that of A100 GPUs, but L4 GPUs are nevertheless good serving solutions if you do not have A100 quota.

In [12]:
base_model_name = "llama3-70b-hf"  # @param ["llama3-8b-hf", "llama3-8b-chat-hf", "llama3-70b-hf", "llama3-70b-chat-hf"] {isTemplate:true}
model_id = os.path.join(MODEL_BUCKET, base_model_name)

Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

In [13]:
# Accelerator selected in the notebook form.
accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_A100"]
# Forwarded by deploy_model_with_config() to deploy_model_vllm(), which turns
# it into the vLLM `--gpu-memory-utilization` argument.
gpu_memory_utilization = 0.85
max_model_len = 8192  # Maximum context length.

def deploy_model_with_config(machine_type, accelerator_type, accelerator_count):
    """Deploys the selected LLaMA 3 model on the given machine shape.

    The model path, service account, GPU memory utilization and maximum model
    length are taken from the notebook-level variables `model_id`,
    `SERVICE_ACCOUNT`, `gpu_memory_utilization` and `max_model_len`.

    Args:
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type.
        accelerator_count: Number of accelerators to use.

    Returns:
        A `(model, endpoint)` tuple as returned by `deploy_model_vllm`.
    """
    deployment_kwargs = dict(
        model_name=get_job_name_with_datetime(prefix="llama3-serve"),
        model_id=model_id,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
    )
    deployed_model, deployed_endpoint = deploy_model_vllm(**deployment_kwargs)
    return deployed_model, deployed_endpoint

# Machine shapes to try for each model size, listed in default preference
# order. The size key is matched as a substring of `base_model_name`.
candidate_configs_by_size = {
    "8b": [
        # L4 serving is more cost efficient than V100 serving.
        {"machine_type": "g2-standard-8", "accelerator_type": "NVIDIA_L4", "accelerator_count": 1},
        {"machine_type": "a2-highgpu-1g", "accelerator_type": "NVIDIA_TESLA_A100", "accelerator_count": 1},
    ],
    # If you do not have access to 4 A100 (40G) GPUs, you may serve LLaMA3 70B
    # models with 8 L4 (24G) GPUs.
    # Note that with the default timeout threshold of Vertex endpoints, you should
    # set a `max_tokens` configuration of around 1,000 tokens or fewer. If you need
    # longer generated sequences, please file a request with Vertex to allowlist
    # your project for a longer timeout threshold with Vertex endpoints.
    "70b": [
        {"machine_type": "g2-standard-96", "accelerator_type": "NVIDIA_L4", "accelerator_count": 8},
        {"machine_type": "a2-highgpu-4g", "accelerator_type": "NVIDIA_TESLA_A100", "accelerator_count": 4},
    ],
}

# Reset the handles so that a failed run can never silently reuse the model or
# endpoint left in the kernel by an earlier run of this cell.
llama_model, llama_endpoint = None, None

model_size = next(
    (size for size in candidate_configs_by_size if size in base_model_name), None
)

if model_size is None:
    print(f"Recommended GPU setting not found for: {accelerator_type} and {base_model_name}.")
else:
    # Try the accelerator chosen in the form first, then fall back to the
    # remaining shapes. The sort is stable, so with the default "NVIDIA_L4"
    # the order is L4 first and A100 second.
    candidate_configs = sorted(
        candidate_configs_by_size[model_size],
        key=lambda config: config["accelerator_type"] != accelerator_type,
    )
    for config in candidate_configs:
        try:
            llama_model, llama_endpoint = deploy_model_with_config(**config)
            break
        except Exception as e:
            print(f"Error: {e} ")
    else:
        # Every candidate failed: report the accelerators that were actually
        # attempted rather than only the form selection.
        tried_accelerators = ", ".join(
            config["accelerator_type"] for config in candidate_configs
        )
        print(f"Recommended GPU setting not found for: {tried_accelerators} and {base_model_name}.")

Creating Endpoint
Create Endpoint backing LRO: projects/599987431781/locations/asia-southeast1/endpoints/8298154992313827328/operations/3284767898236616704
Endpoint created. Resource name: projects/599987431781/locations/asia-southeast1/endpoints/8298154992313827328
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/599987431781/locations/asia-southeast1/endpoints/8298154992313827328')
Creating Model
Create Model backing LRO: projects/599987431781/locations/asia-southeast1/models/8793586135696670720/operations/7657763136413368320
Model created. Resource name: projects/599987431781/locations/asia-southeast1/models/8793586135696670720@1
To use this Model in another session:
model = aiplatform.Model('projects/599987431781/locations/asia-southeast1/models/8793586135696670720@1')
Deploying model to Endpoint : projects/599987431781/locations/asia-southeast1/endpoints/8298154992313827328
Deploy Endpoint model backing LRO: projects/599987431781/locations/asia-sou

### Predict
Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).
Example:
```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```
Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

In [29]:
prompt = "What is an apple?"  # @param {type: "string"}: Text prompt the deployed model will complete
max_tokens = 50  # @param {type:"integer"}:  Limit on the length of the generated text, in tokens
temperature = 1.0  # @param {type:"number"}: Randomness of the text
top_p = 1.0  # @param {type:"number"}: probability distribution
top_k = 1  # @param {type:"integer"}: number of continuations to consider at each step when building the sequence
raw_response = False  # @param {type:"boolean"}: whether to use raw response

# Overrides the sampling parameters for this inference request.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the maximum number of output tokens, e.g. set max_tokens to 20.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
# `llama_endpoint` is assigned by the LLaMA 3 deployment cell above.
response = llama_endpoint.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

Prompt:
What is an apple?
Output:
 It is a fruit. It is a fruit that grows on a tree. It is a fruit that grows on a tree that is called an apple tree. It is a fruit that grows on a tree that is called an apple tree that is grown in


## Deploy Prebuilt Mistral model with vLLM

This section deploys the prebuilt Mistral model with [vLLM](https://github.com/vllm-project/vllm) on a Vertex endpoint. The model deployment step will take ~15 minutes to complete.

vLLM is a highly optimized LLM serving framework which can significantly increase serving throughput. The higher QPS you have, the more benefits you get using vLLM.

Set the prebuilt model id.

In [15]:
# prebuilt_model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # @param ["mistralai/Mistral-7B-v0.1", "mistralai/Mistral-7B-Instruct-v0.1"]

In [16]:
# # Find Vertex AI prediction supported accelerators and regions in
# # https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# # Sets 1 L4 to deploy Mistral 7B.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# # Sets 2 V100s to deploy Mistral 7B.
# # machine_type = "n1-standard-16"
# # accelerator_type = "NVIDIA_TESLA_V100"
# # accelerator_count = 2

# # Sets 2 T4s to deploy Mistral 7B.
# # machine_type = "n1-standard-16"
# # accelerator_type = "NVIDIA_TESLA_T4"
# # accelerator_count = 2

# # Sets 1 A100 (40G) to deploy Mistral 7B.
# # machine_type = "a2-highgpu-1g"
# # accelerator_type = "NVIDIA_TESLA_A100"
# # accelerator_count = 1

# # Larger setting of `max-model-len` can lead to higher requirements on
# # `gpu-memory-utilization` and GPU configuration.
# max_model_len = 4096
# gpu_memory_utilization = 0.9

# def deploy_model_with_config(machine_type, accelerator_type, accelerator_count ):
    
#     model, endpoint = deploy_model_vllm(
#     model_name=get_job_name_with_datetime(prefix="mistral-serve-vllm"),
#     model_id=prebuilt_model_id,
#     service_account=SERVICE_ACCOUNT,
#     machine_type=machine_type,
#     accelerator_type=accelerator_type,
#     accelerator_count=accelerator_count,
#     gpu_memory_utilization=gpu_memory_utilization,
#     max_model_len=max_model_len,
#     use_openai_server=True,
#     use_chat_completions_if_openai_server=False,
#     )
#     return(model, endpoint)

# try:
#     # Code that might potentially cause an error
#     machine_type = "a2-highgpu-1g"
#     accelerator_type = "NVIDIA_TESLA_A100"
#     accelerator_count = 1
#     model, mistral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
# except Exception as e:  # Replace 'ErrorType' with the specific error you want to catch 
#     print(f"Error: {e} ")
#     try : 
#         machine_type = "g2-standard-8"
#         accelerator_type = "NVIDIA_L4"
#         accelerator_count = 1
#         model, mistral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
#     except Exception as e:
#         print("Error may becaused due to machine unavailable - A100s - trying T4s ")
#         print(f"Error: {e} ")
#         machine_type = "n1-standard-16"
#         accelerator_type = "NVIDIA_TESLA_T4"
#         accelerator_count = 2
#         model, mistral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
# else:
#     # Code to execute if there's no error in the 'try' block
#     print("Error may becaused due to machine unavailable of any type ")

NOTE: If you see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint, the model server is likely still initializing. Please retry later.

NOTE: If you receive `InternalServerError: 500 System error` during the deployment, most likely the operation failed due to unavailability of resources. Either retry or use a different accelerator type.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

### Run sample prompt

Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).

In [17]:
# prompt="My favourite condiment is" # @param {type: "string"}: Text prompt that Mistral will generate
# n=1 # @param {type:"integer"}:  Number of responses to generate
# max_tokens=200 # @param {type:"integer"}:  Word limit of text generated
# temperature=1.0 # @param {type:"number"}: Randomness of the text
# top_p=1.0 # @param {type:"number"}: probability distribution
# top_k=1.0 # @param {type:"integer"}: number of continuations to consider at each step when building the sequence

# instances = [
#     {
#         "prompt": prompt,
#         "n": n,
#         "max_tokens": max_tokens,
#         "temperature": temperature,
#         "top_p": top_p,
#         "top_k": top_k,
#     },
# ]
# response = mistral_endpoint.predict(instances=instances)

# for prediction in response.predictions:
#     print(prediction)

In [18]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the endpoint name of
#   the endpoint `endpoint` created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_without_peft.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

# Reference the following code for using the OpenAI vLLM server.
# import json
# response = endpoint.raw_predict(
#     body=json.dumps({
#         "model": prebuilt_model_id,
#         "prompt": "My favourite condiment is",
#         "n": 1,
#         "max_tokens": 200,
#         "temperature": 1.0,
#         "top_p": 1.0,
#         "top_k": 10,
#     }),
#     headers={"Content-Type": "application/json"},
# )
# print(response.json())

## Deploy Prebuilt Mixtral 8x7B model with vLLM

This section deploys the prebuilt Mixtral 8x7B model with [vLLM](https://github.com/vllm-project/vllm) on a Vertex endpoint. The model deployment step will take ~40 minutes to complete.

vLLM is a highly optimized LLM serving framework which can significantly increase serving throughput. The higher QPS you have, the more benefits you get using vLLM.

Set the prebuilt model id.

In [19]:
# prebuilt_model_id = "mistralai/Mixtral-8x7B-v0.1"  # @param ["mistralai/Mixtral-8x7B-v0.1", "mistralai/Mixtral-8x7B-Instruct-v0.1"]

NOTE: If you see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint, the model server is likely still initializing. Please retry later.

NOTE: If you receive `InternalServerError: 500 System error` during the deployment, most likely the operation failed due to unavailability of resources. Either retry or use a different accelerator type.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

In [20]:
# # Find Vertex AI prediction supported accelerators and regions in
# # https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# # Sets 8 L4s to deploy Mixtral 8x7B.
# machine_type = "g2-standard-96"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 8

# # Sets 4 A100s (40G) to deploy Mixtral 8x7B.
# # machine_type = "a2-highgpu-4g"
# # accelerator_type = "NVIDIA_TESLA_A100"
# # accelerator_count = 4
# def deploy_model_with_config(machine_type, accelerator_type, accelerator_count ):
    
#     model, endpoint = deploy_model_vllm(
#     model_name=get_job_name_with_datetime(prefix="mixtral-serve-vllm"),
#     model_id=prebuilt_model_id,
#     service_account=SERVICE_ACCOUNT,
#     machine_type=machine_type,
#     accelerator_type=accelerator_type,
#     accelerator_count=accelerator_count,
#     )
#     return(model, endpoint)

# try:
#     # Code that might potentially cause an error
#     machine_type = "g2-standard-96"
#     accelerator_type = "NVIDIA_L4"
#     accelerator_count = 8
#     model, mixtral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
# except :  # Replace 'ErrorType' with the specific error you want to catch 
#     try : 
#         print("Trying  A100s - in the region ")
#         machine_type = "a2-highgpu-4g"
#         accelerator_type = "NVIDIA_TESLA_A100"
#         accelerator_count = 4
#         model, mixtral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
#     except :
#         print("Error may becaused due to machine unavailable - A100s - trying T4s ")
#         machine_type = "n1-standard-96"
#         accelerator_type = "NVIDIA_TESLA_T4"
#         accelerator_count = 8
#         model, mixtral_endpoint = deploy_model_with_config(machine_type, accelerator_type, accelerator_count )
# else:
#     # Code to execute if there's no error in the 'try' block
#     print("Error may becaused due to machine unavailable of any type ")


### Run sample prompt

Try different prompts and configurations to see the results!
For reference, here is a quick explanation of each configuration:
* prompt: Text prompt that Mistral will generate
* n: Number of responses to generate
* max tokens: Word limit of text generated
* temperature: Randomness of the text
* top p: probability distribution
* top k: number of continuations to consider at each step when building the sequence

In [21]:
# prompt="What is a car?"
# max_tokens=50
# temperature=1.0
# top_p=1.0
# top_k=10

# instances = [
#     {
#         "prompt": prompt,
#         "max_tokens": max_tokens,
#         "temperature": temperature,
#         "top_p": top_p,
#         "top_k": top_k,
#     },
# ]
# response = mixtral_endpoint.predict(instances=instances)

# for prediction in response.predictions:
#     print(prediction)

## Run inferences locally with prebuilt Mistral and Mixtral models

You will need at least 24GB of memory to run inference with Mistral-7B. You can run locally or on Vertex AI Prediction endpoints with any of the following specs:
- g2-standard-8 with 1 L4 GPU
- n1-standard-16 with 2 V100 GPUs
- n1-standard-16 with 2 T4 GPUs
- a2-highgpu-1g with 1 A100 GPU

You will need at least 96GB of memory to run inference with Mixtral 8x7B. You can run locally or on Vertex AI Prediction endpoints with any of the following specs:
- g2-standard-96 with 8 L4 GPUs
- n1-standard-32 with 8 V100 GPUs
- n1-standard-32 with 8 T4 GPUs
- a2-highgpu-4g with 4 A100 GPUs

In [22]:
# %%time
# import torch
# import transformers
# from transformers import AutoModelForCausalLM, AutoTokenizer

# device = "cuda"  # the device to load the model onto
# model_name = "mistralai/Mistral-7B-v0.1"  # @param ["mistralai/Mistral-7B-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-v0.1", "mistralai/Mixtral-8x7B-Instruct-v0.1"]
# model = AutoModelForCausalLM.from_pretrained(
#     model_name, device_map="auto", return_dict=True, torch_dtype=torch.float16
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)

# prompt = "My favourite condiment is"

# sequences = pipeline(
#     prompt,
#     max_length=200,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
# )

# for seq in sequences:
#     print(f"Result: {seq['generated_text']}")

## Cleaning up
You can delete the individual resources you created in this tutorial.

### Undeploy models and Delete endpoints

Set this flag to delete endpoint including undeploying models

In [23]:
# delete_endpoint = True

In [24]:
# def list_endpoints():
#     return [
#         (r.name, r.display_name)
#         for r in aiplatform.Endpoint.list()
# #        if r.display_name.startswith("mistral-serve-vllm")
#     ]

Delete the endpoint using the Vertex AI fully qualified identifier for the endpoint

In [25]:
# try:
#     if delete_endpoint:
#         endpoints = list_endpoints()
#         for endpoint_id, endpoint_name in endpoints:
#             endpoint = aiplatform.Endpoint(endpoint_id)
#             print(
#                 f"Undeploying all deployed models and deleting endpoint {endpoint_id} [{endpoint_name}]"
#             )
#             endpoint.delete(force=False)
# except Exception as e:
#     print(e)

### Delete Cloud Storage bucket (Not needed)

In [26]:
#import os

#delete_bucket = False

#job.delete()

#if delete_bucket or os.getenv("ID_TESTING"):
#    ! gsutil rm -rf {BUCKET_URI}