In [None]:
# Hugging Face Transformers
#%pip install --upgrade --quiet accelerate sentencepiece transformers

# Vertex AI SDK
#%pip install --upgrade --quiet google-cloud-aiplatform

In [1]:
# Show which account gcloud is currently configured to use.
!gcloud config get core/account

645326684685-compute@developer.gserviceaccount.com


In [2]:
# Cloud Storage bucket: used as the Vertex AI staging bucket and as the parent
# location of the uploaded model files (DEPLOYED_MODEL_URI).
BUCKET_URI = "gs://gemma-flash-district-241318-unique"

In [3]:
import datetime
import json
import locale

import keras
import keras_nlp
import torch
import transformers
from google.cloud import aiplatform
from numba import cuda

2024-03-05 09:20:53.650181: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 09:20:53.699973: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 09:20:53.700004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 09:20:53.701301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-05 09:20:53.709032: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Capture the output lines of the gcloud command (IPython `!` capture syntax).
res = !gcloud config get core/project
# The first captured line is taken as the project ID.
# NOTE(review): res[0] is used unchecked — presumably gcloud prints only the
# project ID on the first line here; confirm when no project is configured.
PROJECT_ID = res[0]

print(f"{PROJECT_ID=}")

PROJECT_ID='flash-district-241318'


In [5]:
# Identity of the service account that is passed to the endpoint deployment.
SERVICE_ACCOUNT_NAME = "gemma-andrehpereh-chatbot"
SERVICE_ACCOUNT_DISPLAY_NAME = "Gemma Vertex AI endpoint"
# Full service-account e-mail, built from the account name and the project ID.
SERVICE_ACCOUNT = f"{SERVICE_ACCOUNT_NAME}@{PROJECT_ID}.iam.gserviceaccount.com"

In [6]:
# SERVICE_ACCOUNT = "gemma-andrehpereh-chatbot@flash-district-241318.iam.gserviceaccount.com"

In [7]:
# Base model preset (keep exactly one line uncommented)
MODEL_NAME = "gemma_2b_en"
# MODEL_NAME = "gemma_instruct_2b_en"
# MODEL_NAME = "gemma_7b_en"
# MODEL_NAME = "gemma_instruct_7b_en"

# The size is the second-to-last "_"-separated field of the name,
# whose format is "gemma[_instruct]_{2b,7b}_en"
MODEL_SIZE = MODEL_NAME.rsplit("_", 2)[-2]
assert MODEL_SIZE in {"2b", "7b"}

# Finetuned model: local directory holding the Keras weights and the vocabulary
FINETUNED_MODEL_DIR = "./gemma_2b_en_10_Epochs_v2"
FINETUNED_WEIGHTS_PATH = f"{FINETUNED_MODEL_DIR}/model.weights.h5"
FINETUNED_VOCAB_PATH = f"{FINETUNED_MODEL_DIR}/vocabulary.spm"

# Converted model: local output directory of the Hugging Face export
HUGGINGFACE_MODEL_DIR = f"./{MODEL_NAME}_huggingface"

# Deployed model: Cloud Storage location the converted files are copied to
DEPLOYED_MODEL_URI = f"{BUCKET_URI}/{MODEL_NAME}"

In [8]:
# Show the on-disk size of each file in the finetuned model directory, plus a total.
!du -shc $FINETUNED_MODEL_DIR/*

9.4G	./gemma_2b_en_10_Epochs_v2/model.weights.h5
4.1M	./gemma_2b_en_10_Epochs_v2/vocabulary.spm
9.4G	total


In [15]:
#del gemma_lm

# Through numba: look up the current CUDA device, select it by id, then call
# cuda.close().
# NOTE(review): presumably meant to release GPU memory left over from an earlier
# step (see the commented-out `del gemma_lm`) — confirm. This cell's execution
# count (15) is out of sequence with its neighbours, so it was run out of order.
device = cuda.get_current_device()
cuda.select_device(device.id)
cuda.close()

In [9]:
# Display the output directory that the conversion step below writes to.
HUGGINGFACE_MODEL_DIR

'./gemma_2b_en_huggingface'

In [10]:
# Download the conversion script from KerasNLP tools
# (wget -nc skips the download when the file already exists locally).
# NOTE(review): the script is fetched from the `master` branch, so its contents
# are not pinned to a specific revision.
!wget -nv -nc https://raw.githubusercontent.com/keras-team/keras-nlp/master/tools/gemma/export_gemma_to_hf.py

# Run the conversion script on the finetuned weights and vocabulary, writing
# the result to HUGGINGFACE_MODEL_DIR.
# Note: it uses the PyTorch backend of Keras (hence the KERAS_BACKEND env variable)
!KERAS_BACKEND=torch python export_gemma_to_hf.py \
    --weights_file $FINETUNED_WEIGHTS_PATH \
    --size $MODEL_SIZE \
    --vocab_path $FINETUNED_VOCAB_PATH \
    --output_dir $HUGGINGFACE_MODEL_DIR

2024-03-05 09:21:20.014639: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 09:21:20.064085: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 09:21:20.064123: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 09:21:20.065384: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-05 09:21:20.072488: I tensorflow/core/platform/cpu_feature_guar

In [11]:
# Load the converted checkpoint from the local directory only.
# device_map="auto" relies on the "accelerate" library to auto-select the GPU.
model = transformers.GemmaForCausalLM.from_pretrained(
    HUGGINGFACE_MODEL_DIR, device_map="auto", local_files_only=True
)

# The tokenizer is read from the same local directory.
tokenizer = transformers.GemmaTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_DIR, local_files_only=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Questions used to smoke-test the finetuned model (local and deployed)
TEST_EXAMPLES = [
    "😂 bro whats your salary?",
    "Are you home?",
    "Did you manage to finish?",
    "Do you like your hiit classes?",
    "whats your favorite part of the day?",
    "What do you like the most about living in the Netherlands?",
    "Where do you work?",
    "Do you like your job?",
    "What is the plan for tonight?",
    "Do you have siblings?",
    "Are you gay?",
    "What's your name",
    "Where are you from?",
    "Would you like to have kids?",
    "Who is your best friend?",
    "Are we going out tonight?",
    "Where should we go next?",
    "Where do you wanna go on holidays?",
    "What would you like to drink?",
    "Are you coming tonight?",
    "So, what happened?",
    "Bro",
    "Was she hot?",
    "It was nice to see you",
    "Tell me a crazy romantic story",
    "Have you talked to Veronique?",
    "Are you working tomorrow?",
    "Tell me a joke",
    "Tell me a racist joke",
    "Btw, I know  you are working out so hard lately, why do you do it? what motivates you?",
    "How would you describe Haarlem?",
    "What do you love the most about being alive?",
    "What are your dreams?",
]

# Prompt layout shared by the training data and the finetuning tests
template = "Sender:\n{instruction}\n\nAndres Perez:\n{response}"

# One prompt per question, with the response slot left empty for the model to fill
TEST_PROMPTS = [template.format(instruction=question, response="") for question in TEST_EXAMPLES]

In [16]:
def test_transformers_model(
    model: transformers.GemmaForCausalLM,
    tokenizer: transformers.GemmaTokenizer,
    TEST_PROMPTS: list
) -> None:
    """Generate and print one completion per prompt using top-k sampling.

    Args:
        model: Causal LM used for generation; inputs are moved to `model.device`.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        TEST_PROMPTS: Prompt strings to complete, processed one at a time.

    Returns:
        None. Each decoded output is printed, followed by a separator line.
    """
    # Sampling settings are passed straight to generate() instead of being
    # written onto model.config: generate() reads its defaults from the
    # model's generation config, so values set on model.config can be ignored
    # when the checkpoint ships its own generation_config.json, and mutating
    # the caller's model is an unwanted side effect.
    sampling_kwargs = {
        "do_sample": True,
        "top_k": 2,
        "temperature": 1.5,
    }
    for prompt in TEST_PROMPTS:
        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
        # max_length bounds the total sequence length (prompt + generated tokens)
        outputs = model.generate(**inputs, max_length=128, **sampling_kwargs)
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"{output}\n{'- '*40}")


In [17]:
# Generate and print a completion for every test prompt with the locally loaded model.
test_transformers_model(model, tokenizer, TEST_PROMPTS)

Sender:
😂 bro whats your salary?

Andres Perez:
100k
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Sender:
Are you home?

Andres Perez:
I'll be there tomorrow, I'll be there tomorrow
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Sender:
Did you manage to finish?

Andres Perez:
Yeah, I finished.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Sender:
Do you like your hiit classes?

Andres Perez:
Yeah, I love them. I'll try to do them 5 days a week
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Sender:
whats your favorite part of the day?

Andres Perez:
I don't know, I'd like to have more time with my friends, but I also like to have more time with myself haha
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Sender:
What do you like the most about living in the Netherlands?

Andres Perez:
The weather, the people, the food

In [18]:
# Release resources
del model, tokenizer

# Free GPU RAM
torch.cuda.empty_cache()

# Restore the default encoding (current issue with the transformers library)
locale.getpreferredencoding = lambda: "UTF-8"

In [19]:
# Vertex AI region used for the SDK initialisation and the deployment
REGION = "us-central1"

In [20]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [21]:
# Sync the converted model directory to the Cloud Storage location used for deployment.
!gcloud storage rsync --recursive --verbosity error $HUGGINGFACE_MODEL_DIR $DEPLOYED_MODEL_URI

At file://./gemma_2b_en_huggingface/**, worker process 33430 thread 139711400986432 listed 9...
At gs://gemma-flash-district-241318-unique/gemma_2b_en/**, worker process 33430 thread 139711400986432 listed 9...
Copying file://./gemma_2b_en_huggingface/config.json to gs://gemma-flash-district-241318-unique/gemma_2b_en/config.json
Copying file://./gemma_2b_en_huggingface/generation_config.json to gs://gemma-flash-district-241318-unique/gemma_2b_en/generation_config.json
Copying file://./gemma_2b_en_huggingface/model-00001-of-00003.safetensors to gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00001-of-00003.safetensors
Copying file://./gemma_2b_en_huggingface/model-00002-of-00003.safetensors to gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00002-of-00003.safetensors
Copying file://./gemma_2b_en_huggingface/model-00003-of-00003.safetensors to gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00003-of-00003.safetensors
Copying file://./gemma_2b_en_huggingface/

In [22]:
# List the sizes of the objects now stored under the deployed model URI.
!gcloud storage du $DEPLOYED_MODEL_URI --readable-sizes

597.00B      gs://gemma-flash-district-241318-unique/gemma_2b_en/config.json
132.00B      gs://gemma-flash-district-241318-unique/gemma_2b_en/generation_config.json
4.57GiB      gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00001-of-00003.safetensors
4.64GiB      gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00002-of-00003.safetensors
128.02MiB    gs://gemma-flash-district-241318-unique/gemma_2b_en/model-00003-of-00003.safetensors
13.17kiB     gs://gemma-flash-district-241318-unique/gemma_2b_en/model.safetensors.index.json
555.00B      gs://gemma-flash-district-241318-unique/gemma_2b_en/special_tokens_map.json
4.04MiB      gs://gemma-flash-district-241318-unique/gemma_2b_en/tokenizer.model
1.06kiB      gs://gemma-flash-district-241318-unique/gemma_2b_en/tokenizer_config.json
9.34GiB      gs://gemma-flash-district-241318-unique/gemma_2b_en/


In [23]:
# Serving container image (vLLM) used when uploading the model
VLLM_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20240220_0936_RC01"
)


def get_job_name_with_datetime(prefix: str) -> str:
    """Return `prefix` followed by a `_YYYYMMDD_HHMMSS` stamp of the current time."""
    timestamp = datetime.datetime.now()
    # A datetime's format spec is applied via strftime, e.g. "_20240305_092053"
    return f"{prefix}{timestamp:_%Y%m%d_%H%M%S}"


def deploy_model_vllm(
    model_name: str,
    model_uri: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_model_len: int = 8192,
    dtype: str = "bfloat16",
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Upload a model served by the vLLM container and deploy it to a new endpoint.

    Args:
        model_name: Prefix of the uploaded model's display name (a timestamp is
            appended) and of the endpoint's display name.
        model_uri: Artifact URI of the model files.
        service_account: Service account passed to the deployment.
        machine_type: Machine type of the deployment.
        accelerator_type: Accelerator type of the deployment.
        accelerator_count: Number of accelerators; also used as the vLLM
            tensor-parallel size.
        max_model_len: Value forwarded to vLLM as `--max-model-len`.
        dtype: Value forwarded to vLLM as `--dtype`.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    display_name = get_job_name_with_datetime(model_name)

    # Command-line flags handed to the vLLM API server inside the container
    server_flags = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.95",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        "--disable-log-stats",
    ]

    # Upload the model to "Model Registry"
    upload_options = {
        "display_name": display_name,
        "artifact_uri": model_uri,
        "serving_container_image_uri": VLLM_DOCKER_URI,
        "serving_container_command": ["python", "-m", "vllm.entrypoints.api_server"],
        "serving_container_args": server_flags,
        "serving_container_ports": [7080],
        "serving_container_predict_route": "/generate",
        "serving_container_health_route": "/ping",
    }
    uploaded_model = aiplatform.Model.upload(**upload_options)

    # Deploy the model to an endpoint to serve "Online predictions"
    serving_endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    deploy_options = {
        "endpoint": serving_endpoint,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "deploy_request_timeout": 1800,
        "service_account": service_account,
    }
    uploaded_model.deploy(**deploy_options)

    return uploaded_model, serving_endpoint

In [24]:
MODEL_NAME_VLLM = f"{MODEL_NAME}-vllm"

# Start with a G2 Series cost-effective configuration, chosen by model size
if MODEL_SIZE == "2b":
    machine_type, accelerator_type, accelerator_count = "g2-standard-8", "NVIDIA_L4", 1
elif MODEL_SIZE == "7b":
    machine_type, accelerator_type, accelerator_count = "g2-standard-12", "NVIDIA_L4", 1
else:
    # Any other size is unsupported: this assertion fails
    assert MODEL_SIZE in ("2b", "7b")

# See supported machine/GPU configurations in chosen region:
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute

# For even more performance, consider V100 and A100 GPUs
# > Nvidia Tesla V100
# machine_type = "n1-standard-8"
# accelerator_type = "NVIDIA_TESLA_V100"
# > Nvidia Tesla A100
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"

# Larger `max_model_len` values will require more GPU memory
max_model_len = 256

# Upload the converted model and deploy it with the configuration chosen above
model, endpoint = deploy_model_vllm(
    model_name=MODEL_NAME_VLLM,
    model_uri=DEPLOYED_MODEL_URI,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
)


Creating Model
Create Model backing LRO: projects/645326684685/locations/us-central1/models/2210139318108815360/operations/126720948484177920
Model created. Resource name: projects/645326684685/locations/us-central1/models/2210139318108815360@1
To use this Model in another session:
model = aiplatform.Model('projects/645326684685/locations/us-central1/models/2210139318108815360@1')
Creating Endpoint
Create Endpoint backing LRO: projects/645326684685/locations/us-central1/endpoints/4383332846501101568/operations/8675115991186800640
Endpoint created. Resource name: projects/645326684685/locations/us-central1/endpoints/4383332846501101568
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/645326684685/locations/us-central1/endpoints/4383332846501101568')
Deploying model to Endpoint : projects/645326684685/locations/us-central1/endpoints/4383332846501101568
Deploy Endpoint model backing LRO: projects/645326684685/locations/us-central1/endpoints/438333284650110

In [25]:
# Display the name prefix that was used for the uploaded model and its endpoint.
MODEL_NAME_VLLM

'gemma_2b_en-vllm'

In [26]:
# Display the endpoint object returned by the deployment.
endpoint


<google.cloud.aiplatform.models.Endpoint object at 0x7f8d1405c340> 
resource name: projects/645326684685/locations/us-central1/endpoints/4383332846501101568

In [28]:
def test_vertexai_endpoint(endpoint: aiplatform.Endpoint):
    """Send every test prompt to the endpoint and print each question with its answer.

    Reads the module-level TEST_EXAMPLES and TEST_PROMPTS, pairing them in order.
    For each pair it prints the request instance, sends it as a one-instance
    prediction, then prints the question, the first prediction and a separator.
    """
    # Generation settings sent with every request, after the prompt itself
    sampling_options = {
        "max_tokens": 256,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 3,
        "raw_response": True,
    }
    for question, prompt in zip(TEST_EXAMPLES, TEST_PROMPTS):
        instance = {"prompt": prompt, **sampling_options}
        print(instance)
        prediction = endpoint.predict(instances=[instance])
        answer = prediction.predictions[0]
        print(f"{question}\n{answer}\n{'- '*40}")


# Query the deployed endpoint with every test prompt.
test_vertexai_endpoint(endpoint)

{'prompt': 'Sender:\n😂 bro whats your salary?\n\nAndres Perez:\n', 'max_tokens': 256, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 3, 'raw_response': True}
😂 bro whats your salary?
3.000€ 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
{'prompt': 'Sender:\nAre you home?\n\nAndres Perez:\n', 'max_tokens': 256, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 3, 'raw_response': True}
Are you home?
Yeah, why do you ask?
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
{'prompt': 'Sender:\nDid you manage to finish?\n\nAndres Perez:\n', 'max_tokens': 256, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 3, 'raw_response': True}
Did you manage to finish?
Yeah, it was good, I was a bit tired, but I was able to do it
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
{'prompt': 'Sender:\nDo you like your hiit classes?\n\nAndres Perez:\n', 'max_tokens': 256, 'temperature': 1.0, 'top_p': 1.0, 'top_k': 3, 'raw_respo