In [None]:
# Notebook-wide configuration; later cells interpolate these names into
# gcloud commands and Vertex AI SDK calls.
REGION = "europe-west4"  # used for the Artifact Registry host, aiplatform.init and the API endpoint
PROJECT_ID = "huggingface-cloud"
REPOSITORY = "custom-inference-gpu"  # Artifact Registry repository segment of the image URI
IMAGE = "huggingface-pipeline-gpu"  # image name segment of the image URI
TAG = "py310-cu12.3-torch-2.2.0-transformers-4.38.1"  # image tag segment of the image URI
BUCKET_NAME = "huggingface-cloud"
# Full GCS object path the packaged `model.tar.gz` is uploaded to (and removed from at cleanup).
BUCKET_URI = f"gs://{BUCKET_NAME}/bart-large-mnli/model.tar.gz"

In [None]:
# Enable Git LFS before cloning, then clone the model repository from the
# Hugging Face Hub into ./bart-large-mnli.
!git lfs install
!git clone https://huggingface.co/facebook/bart-large-mnli

In [None]:
# Quick check that the clone produced the `bart-large-mnli` directory.
!ls

In [None]:
# Pack the repository contents into `model.tar.gz` (the file name the predictor's
# `load` opens), skipping the three weight files named by `--exclude`, then move
# the archive next to the notebook.
# NOTE(review): presumably the remaining weights file is the safetensors one — confirm.
!cd bart-large-mnli/ && tar zcvf model.tar.gz --exclude flax_model.msgpack --exclude pytorch_model.bin --exclude rust_model.ot * && mv model.tar.gz ../

In [None]:
# Turn on parallel composite uploads in the gcloud config, then copy the archive
# to the GCS object path held in BUCKET_URI.
!gcloud config set storage/parallel_composite_upload_enabled True
!gcloud storage cp model.tar.gz $BUCKET_URI

In [None]:
# List everything in the bucket to confirm the upload landed.
!gcloud storage ls --recursive gs://{BUCKET_NAME}

In [None]:
!mkdir huggingface_predictor_gpu

In [None]:
%%writefile huggingface_predictor_gpu/predictor.py
import os
import logging
import tarfile
from typing import Any, Dict

from transformers import pipeline

from google.cloud.aiplatform.prediction.predictor import Predictor
from google.cloud.aiplatform.utils import prediction_utils

# Module-level logger; its level is set to DEBUG so the debug messages emitted
# while loading the model are not filtered out by this logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class HuggingFacePredictor(Predictor):
    """Predictor that serves a Hugging Face `transformers.pipeline`.

    `load` downloads the model artifacts, unpacks `model.tar.gz` into `./model`
    and builds the pipeline; `predict` forwards the request payload to it.
    The pipeline task is read from the `HF_TASK` environment variable.
    """

    def __init__(self) -> None:
        # Set by `load`; declared here so `predict` can detect a missing load
        # instead of failing with an opaque AttributeError.
        self._pipeline = None

    def load(self, artifacts_uri: str) -> None:
        """Downloads the model artifacts and instantiates the pipeline.

        Args:
            artifacts_uri: Location of the model artifacts; it must contain a
                `model.tar.gz` archive, which is extracted into `./model`.
        """
        logger.debug(f"Downloading artifacts from {artifacts_uri}")
        prediction_utils.download_model_artifacts(artifacts_uri)
        logger.debug("Artifacts successfully downloaded!")
        os.makedirs("./model", exist_ok=True)
        with tarfile.open("model.tar.gz", "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would escape `./model` (absolute paths,
                # `..` components, links pointing outside the destination).
                tar.extractall(path="./model", filter="data")
            else:
                # Older Python builds without extraction filters.
                tar.extractall(path="./model")
        logger.debug(f"HF_TASK value is {os.getenv('HF_TASK')}")
        # `device_map="auto"` leaves device placement to the library
        # (presumably via `accelerate`, which is pinned in requirements.txt).
        self._pipeline = pipeline(os.getenv("HF_TASK"), model="./model", device_map="auto")
        logger.debug("`pipeline` successfully loaded!")
        logger.debug(f"`pipeline` is using device={self._pipeline.device}")

    def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]:
        """Runs the pipeline on a request payload.

        Args:
            instances: Mapping whose items are passed to the pipeline as
                keyword arguments.

        Returns:
            Whatever the pipeline returns for those arguments.

        Raises:
            RuntimeError: If `load` has not been called yet.
        """
        if self._pipeline is None:
            raise RuntimeError("`load` must be called before `predict`.")
        return self._pipeline(**instances)

In [None]:
%%writefile huggingface_predictor_gpu/requirements.txt
transformers==4.38.1
accelerate==0.27.0

---

In [None]:
import os
from google.cloud.aiplatform.prediction import LocalModel

from huggingface_predictor_gpu.predictor import HuggingFacePredictor

# Fully-qualified Artifact Registry URI the serving image is built as.
serving_image_uri = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}"

# Build the serving container from the predictor source directory.
# The base image string also carries `--platform=linux/amd64` and `AS build`
# (presumably spliced verbatim into the generated Dockerfile's FROM line — confirm).
# A previously tried alternative base image was:
#   "--platform=linux/amd64 nvcr.io/nvidia/pytorch:23.11-py3 AS build"
local_model = LocalModel.build_cpr_model(
    "huggingface_predictor_gpu",
    serving_image_uri,
    predictor=HuggingFacePredictor,
    requirements_path="huggingface_predictor_gpu/requirements.txt",
    base_image="--platform=linux/amd64 alvarobartt/torch-gpu:py310-cu12.3-torch-2.2.0 AS build",
)

In [None]:
# Register gcloud as the Docker credential helper for this region's
# Artifact Registry host; `--quiet` skips the confirmation prompt.
!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet

In [None]:
!gcloud artifacts repositories create custom-inference-gpu --repository-format=docker --location={REGION}

In [None]:
# Push the locally built serving image to the registry URI it was built with.
local_model.push_image()

In [None]:
from google.cloud import aiplatform

# Set the default project and region for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION)

In [None]:
# Authenticate the gcloud CLI and set up Application Default Credentials.
# NOTE(review): earlier cells already run gcloud commands and SDK calls that
# presumably need these credentials — consider running this cell first.
!gcloud auth login
!gcloud auth application-default login

In [None]:
# Register the model in Vertex AI, pairing the uploaded artifacts with the
# custom serving image built above.
model = aiplatform.Model.upload(
    display_name="bart-large-mnli",
    # GCS directory that holds `model.tar.gz`; derived from BUCKET_URI so it
    # cannot drift from the location the archive was actually uploaded to.
    artifact_uri=BUCKET_URI.rsplit("/", 1)[0],
    serving_container_image_uri=local_model.get_serving_container_spec().image_uri,
    serving_container_environment_variables={
        # Read by the predictor to choose the `transformers` pipeline task.
        "HF_TASK": "zero-shot-classification",
    },
)

In [None]:
import time

# Deploy the model to an endpoint on a single-L4 machine and report how long
# it took. Timing uses the standard library so the cell needs no extra package.
deploy_started = time.perf_counter()
endpoint = model.deploy(
    machine_type="g2-standard-4",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)
deploy_seconds = time.perf_counter() - deploy_started
print(f"Time to deploy `{model.display_name}` into endpoint `{endpoint.resource_name}` was {deploy_seconds}s")

In [None]:
import json
from google.api import httpbody_pb2
from google.cloud import aiplatform_v1

# Low-level prediction client pointed at the regional Vertex AI API host.
prediction_client = aiplatform_v1.PredictionServiceClient(
    client_options={"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
)

# Request payload; the predictor unpacks this mapping as keyword arguments
# to the pipeline, so the keys must be valid pipeline arguments.
data = {
    "sequences": "Football is a sport",
    "candidate_labels": ["soccer", "football", "basketball"],
}

json_data = json.dumps(data)

# Wrap the JSON bytes in an HttpBody so they can be sent through raw_predict.
http_body = httpbody_pb2.HttpBody(
    data=json_data.encode("utf-8"),
    content_type="application/json",
)

request = aiplatform_v1.RawPredictRequest(
    endpoint=endpoint.resource_name,
    http_body=http_body,
)

response = prediction_client.raw_predict(request)
# Decode the response body; as the last expression it is displayed as the cell output.
json.loads(response.data)

In [None]:
# Tear down the serving resources: the endpoint first, then the model.
# NOTE(review): `force=True` presumably undeploys the model from the endpoint
# before deleting it — confirm against the SDK docs.
endpoint.delete(force=True)
model.delete()

In [None]:
# Remove the serving image (and its tags) from Artifact Registry, then delete
# the uploaded model archive from the bucket.
!gcloud artifacts docker images delete --quiet --delete-tags {REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}
!gcloud storage rm -r $BUCKET_URI