In [8]:
import os
import sys
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform, language

In [9]:
# Cloud project id.
PROJECT_ID = "sbx-196865-genaift-ds-ccd784e6"  # @param {type:"string"}

# The Hugging Face User Access Token.
HF_TOKEN = "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"  # @param {type:"string"}

# Region for launching jobs.
REGION = "us-central1"  # @param {type:"string"}

# Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://19865_finetuned_models"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com


STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
MODEL_BUCKET = os.path.join(BUCKET_URI, "model")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = "sa-196865-big-data@sbx-196865-genaift-ds-ccd784e6.iam.gserviceaccount.com"  # @param {type:"string"}

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)

Updated property [core/project].


In [10]:
# Hugging Face model id of the base model; the training cell forwards it as
# `--pretrained_model_id`.
base_model = "google/gemma-7b-it"

# Configure the Vertex AI SDK once for the whole notebook: project, region and
# the bucket used for staging.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

In [11]:
# The pre-built training and serving docker images.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240220_0936_RC01"
TGI_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01"

In [12]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local date and time.

    Args:
        prefix: Leading part of the name, e.g. "gemma-lora-train".

    Returns:
        A string of the form "<prefix>_YYYYMMDD_HHMMSS".
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join((prefix, timestamp))


def moderate_text(text: str) -> language.ModerateTextResponse:
    """Sends `text` to the Cloud Natural Language text-moderation endpoint.

    Args:
        text: Plain text to analyze.

    Returns:
        The moderation response returned by `LanguageServiceClient.moderate_text`.
    """
    language_client = language.LanguageServiceClient()
    # The text is submitted as-is, typed as plain text.
    plain_text_type = language.Document.Type.PLAIN_TEXT
    plain_text_document = language.Document(content=text, type_=plain_text_type)
    return language_client.moderate_text(document=plain_text_document)


def show_text_moderation(text: str, response: language.ModerateTextResponse) -> None:
    """Prints the analyzed text followed by a table of moderation categories.

    Categories are listed from highest to lowest confidence, with the
    confidence rendered as a whole-number percentage.

    Args:
        text: The text that was analyzed.
        response: Moderation response whose `moderation_categories` are shown.
    """
    import pandas as pd

    # Highest-confidence categories first.
    ranked = list(response.moderation_categories)
    ranked.sort(key=lambda item: item.confidence, reverse=True)

    rows = [(item.name, item.confidence) for item in ranked]
    table = pd.DataFrame(rows, columns=["category", "confidence"])

    print(f"Text analyzed:\n{text}")
    print(table.to_markdown(index=False, tablefmt="presto", floatfmt=".0%"))


def deploy_model_tgi(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_input_length: int = 512,
    max_total_tokens: int = 2048,
    max_batch_prefill_tokens: int = 2048,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a model with TGI on GPU in Vertex AI.

    Creates a new endpoint, uploads a model backed by the TGI serving image
    and deploys that model to the endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            "<model_name>-endpoint".
        model_id: Value passed to the container as `MODEL_ID`.
        service_account: Service account the deployment runs as.
        machine_type: Machine type for the deployment.
        accelerator_type: Accelerator type attached to the machine.
        accelerator_count: Number of accelerators; also passed to the
            container as `NUM_SHARD`.
        max_input_length: Passed to the container as `MAX_INPUT_LENGTH`.
        max_total_tokens: Passed to the container as `MAX_TOTAL_TOKENS`.
        max_batch_prefill_tokens: Passed to the container as
            `MAX_BATCH_PREFILL_TOKENS`.

    Returns:
        A `(model, endpoint)` tuple for the uploaded model and the endpoint it
        was deployed to.

    Raises:
        ValueError: If the token limits are inconsistent with each other.
    """
    # Fail fast, before any cloud resource is created: the TGI launcher
    # rejects these combinations at container start-up, which would otherwise
    # only surface after the endpoint and model already exist.
    if max_input_length >= max_total_tokens:
        raise ValueError(
            "max_input_length must be smaller than max_total_tokens "
            f"(got {max_input_length} and {max_total_tokens})."
        )
    if max_batch_prefill_tokens < max_input_length:
        raise ValueError(
            "max_batch_prefill_tokens must be at least max_input_length "
            f"(got {max_batch_prefill_tokens} and {max_input_length})."
        )

    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Container configuration is passed through environment variables, which
    # must all be strings.
    env_vars = {
        "MODEL_ID": model_id,
        "NUM_SHARD": f"{accelerator_count}",
        "MAX_INPUT_LENGTH": f"{max_input_length}",
        "MAX_TOTAL_TOKENS": f"{max_total_tokens}",
        "MAX_BATCH_PREFILL_TOKENS": f"{max_batch_prefill_tokens}",
    }

    # Only forward the Hugging Face token when one is configured.
    if HF_TOKEN:
        env_vars["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=TGI_DOCKER_URI,
        serving_container_ports=[80],
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint

In [13]:
# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "gs://19865_finetuned_models/training_data/data.jsonl"  # @param {type:"string"}
# Name of the dataset column containing training text input.
instruct_column_in_dataset = "text"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}
# Batch size for finetuning.
per_device_train_batch_size = 1  # @param{type:"integer"}
# Runs 10 training steps as a minimal example.
max_steps = 10  # @param {type:"integer"}
# Precision mode for finetuning.
finetuning_precision_mode = "float16"  # @param["4bit", "8bit", "float16"]

# Worker pool spec.

# Finetunes Gemma with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1
# Finetunes Gemma with 1 V100 (16G).
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

replica_count = 1

In [14]:
# Setup training job.
job_name = get_job_name_with_datetime("gemma-lora-train")

# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)

# Create a GCS folder to store the LORA adapter.
lora_adapter_dir = get_job_name_with_datetime("gemma-lora-adapter")
lora_output_dir = os.path.join(MODEL_BUCKET, lora_adapter_dir)

# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_dir = get_job_name_with_datetime("gemma-merged-model")
merged_model_output_dir = os.path.join(MODEL_BUCKET, merged_model_dir)

train_job.run(
    args=[
        "--task=instruct-lora",
        f"--pretrained_model_id={base_model}",
        f"--dataset_name={dataset_name}",
        f"--instruct_column_in_dataset={instruct_column_in_dataset}",
        f"--output_dir={lora_output_dir}",
        f"--merge_base_and_lora_output_dir={merged_model_output_dir}",
        f"--per_device_train_batch_size={per_device_train_batch_size}",
        "--lora_rank=16",
        "--lora_alpha=64",
        "--lora_dropout=0.1",
        f"--max_steps={max_steps}",
        "--max_seq_length=512",
        "--learning_rate=2e-4",
        f"--precision_mode={finetuning_precision_mode}",
        f"--template={template}",
        f"--huggingface_access_token={HF_TOKEN}",
    ],
    environment_variables={"WANDB_DISABLED": True},
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
    service_account=SERVICE_ACCOUNT,
)

print("LoRA adapter was saved in: ", lora_output_dir)
print("Trained and merged models were saved in: ", merged_model_output_dir)

Training Output directory:
gs://19865_finetuned_models/temporal/aiplatform-custom-training-2024-03-07-11:35:09.404 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7029453745071063040?project=81995035742
CustomContainerTrainingJob projects/81995035742/locations/us-central1/trainingPipelines/7029453745071063040 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4893157627883683840?project=81995035742
CustomContainerTrainingJob projects/81995035742/locations/us-central1/trainingPipelines/7029453745071063040 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/81995035742/locations/us-central1/trainingPipelines/7029453745071063040 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomContainerTrainingJob projects/81995035742/locations/us-central1/trainingPipelines/7029453745071063040 current state:
PipelineS

RuntimeError: Training failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=81995035742&resource=ml_job%2Fjob_id%2F4893157627883683840&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%224893157627883683840%22"


In [None]:
# Serving hardware: one L4 GPU on a g2-standard-8 machine.
# These names are the same ones the training worker-pool cell assigns, so
# re-run that cell before launching another training job.
machine_type, accelerator_type, accelerator_count = "g2-standard-8", "NVIDIA_L4", 1

# Token limits handed to the serving container.
# Note that larger token counts will require more GPU memory.
max_input_length, max_total_tokens, max_batch_prefill_tokens = 512, 1024, 2048

# Serve the merged (base + LoRA) model produced by the training cell.
model, endpoint = deploy_model_tgi(
    model_name=get_job_name_with_datetime(prefix="gemma-tgi-serve"),
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_input_length=max_input_length,
    max_total_tokens=max_total_tokens,
    max_batch_prefill_tokens=max_batch_prefill_tokens,
)
print("endpoint_name:", endpoint.name)