# Deploy LLaVA-v1.5-13B model on Amazon SageMaker

***This notebook works best with the `conda_python3` kernel on a `ml.t3.xlarge` machine***.

---

In this notebook we download the LLaVA-v1.5-13B model from the Hugging Face Hub and deploy it to a SageMaker real-time endpoint running on an `ml.g5.12xlarge` instance.

The downloaded model files are uploaded to an S3 bucket as-is, and the serving code in `llava-src` is archived into a `llava-src.tar.gz` file that is uploaded to the same bucket.

In [15]:
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting git+https://github.com/haotian-liu/LLaVA.git@v1.1.1 (from -r requirements.txt (line 2))
  Cloning https://github.com/haotian-liu/LLaVA.git (to revision v1.1.1) to /tmp/pip-req-build-3kf78r45
  Running command git clone --filter=blob:none --quiet https://github.com/haotian-liu/LLaVA.git /tmp/pip-req-build-3kf78r45
  Running command git checkout -q 1619889c712e347be1cb4f78ec66e7cf414ac1a6
  Resolved https://github.com/haotian-liu/LLaVA.git to commit 1619889c712e347be1cb4f78ec66e7cf414ac1a6
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub==0.19.4 (from -r requirements.txt (line 3))
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting sagemaker==2.199.0 (from -r requirements.txt (line 4))
  Using cached sagemaker-2.199.0-py2.py3-none-any.whl
Collecting py

In [16]:
import os
import time
import boto3
import sagemaker
import globals as g
from pathlib import Path
from sagemaker import image_uris
from utils import get_bucket_name
from sagemaker.s3 import S3Uploader
from sagemaker.utils import name_from_base
from huggingface_hub import snapshot_download

In [17]:
# Global constants: print the contents of globals.py (imported above as `g`)
# with terminal syntax highlighting. Requires the `pygmentize` command-line
# tool to be available on the PATH.
!pygmentize globals.py

[33m"""[39;49;00m
[33mGlobal variables used throughout the code.[39;49;00m
[33m"""[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mboto3[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36msagemaker[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# model deployment[39;49;00m[37m[39;49;00m
HF_MODEL_ID: [36mstr[39;49;00m = [33m"[39;49;00m[33mliuhaotian/llava-v1.5-13b[39;49;00m[33m"[39;49;00m[37m[39;49;00m
LLM_ENGINE: [36mstr[39;49;00m = [33m"[39;49;00m[33mdeepspeed[39;49;00m[33m"[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# HF_TASK: str = "question-answering"[39;49;00m[37m[39;49;00m
[37m# TRANSFORMERS_VERSION: str = "4.28.1"[39;49;00m[37m[39;49;00m
[37m# PYTORCH_VERSION: str = "2.0.0"[39;49;00m[37m[39;49;00m
[37m# PYTHON_VERSION: str = "py310"[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# S3 bucket strucutre, we use the default sagemaker bucket in the curren

In [18]:
# SageMaker session used for the rest of the notebook.
sagemaker_session = sagemaker.Session()

# IAM role the model will be created with (passed as ExecutionRoleArn later on).
role = sagemaker.get_execution_role()

# Low-level SageMaker client exposed by the session, plus a plain S3 client.
sm_client = sagemaker_session.sagemaker_client
s3_client = boto3.client("s3")


In [19]:
# Bucket is looked up from the CloudFormation stack name configured in globals.py.
bucket_name: str = get_bucket_name(g.CFN_STACK_NAME)

# Both artifacts live under s3://<bucket>/<prefix>/<last component of the model id>/.
s3_model_root: str = os.path.join(
    "s3://",
    bucket_name,
    g.BUCKET_PREFIX,
    os.path.basename(g.HF_MODEL_ID),
)

# Destination for the model files.
s3_model_uri: str = os.path.join(s3_model_root, g.S3_MODEL_PREFIX)

# Destination for the archived serving code.
s3_model_code_uri: str = os.path.join(
    s3_model_root, g.S3_MODEL_CODE_PREFIX, "llava-src.tar.gz"
)

In [20]:
# The local download directory is a sibling of the current working directory,
# named after the last path component of the Hugging Face model id.
parent_dir = os.path.dirname(os.getcwd())
local_model_path: str = os.path.join(parent_dir, os.path.basename(g.HF_MODEL_ID))
Path(local_model_path).mkdir(exist_ok=True)

# Only files matching these glob patterns are fetched from the repository.
allow_patterns = [
    "*.json",
    "*.pt",
    "*.bin",
    "*.txt",
    "*.model",
]

# Fetch the repository snapshot, using the directory above as the cache location.
model_download_path = snapshot_download(
    repo_id=g.HF_MODEL_ID, cache_dir=local_model_path, allow_patterns=allow_patterns
)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
%%time
# upload model to s3
S3Uploader.upload(local_path=model_download_path, desired_s3_uri=s3_model_uri)


CPU times: user 3min 37s, sys: 3min 45s, total: 7min 22s
Wall time: 7min 47s


's3://multimodal-bucket-563851014557/multimodal/llava-v1.5-13b/model'

In [22]:
!rm llava-src.tar.gz
!tar zcvf llava-src.tar.gz ../llava-src --exclude ".ipynb_checkpoints" --exclude "__pycache__"
!aws s3 cp llava-src.tar.gz {s3_model_code_uri}

tar: Removing leading `../' from member names
../llava-src/
../llava-src/model.py
../llava-src/requirements.txt
../llava-src/serving.properties
../llava-src/run_llava_local.py
upload: ./llava-src.tar.gz to s3://multimodal-bucket-563851014557/multimodal/llava-v1.5-13b/code/llava-src.tar.gz


In [23]:
# The image framework name is derived from the serving engine set in globals.py.
framework_name = "djl-" + g.LLM_ENGINE

# Look up the inference container image for this framework, region and version.
inference_image_uri = image_uris.retrieve(
    framework=framework_name,
    region=g.AWS_REGION,
    version="0.23.0",
)

print(f"Inference container uri: {inference_image_uri}")

Inference container uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118


### SageMaker endpoint

- Realtime Endpoint

In [24]:
# Derive a model name from the fixed "llava-djl" base.
model_name = name_from_base("llava-djl")
print(model_name)

# The container is given the serving-code archive uploaded to S3 as its model data.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_model_code_uri,
}

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

llava-djl-2024-02-08-22-47-21-804
Created Model: arn:aws:sagemaker:us-east-1:563851014557:model/llava-djl-2024-02-08-22-47-21-804


In [25]:
instance_type = "ml.g5.12xlarge"

# Endpoint config and endpoint names both derive from the model name.
endpoint_config_name = f"{model_name}-12xl-config"
endpoint_name = f"{model_name}-12xl-endpoint"

# Single production variant: one instance of the model created above, with a
# 600-second allowance for the container startup health check.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": instance_type,
    "InitialInstanceCount": 1,
    "ContainerStartupHealthCheckTimeoutInSeconds": 600,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
print(endpoint_config_response)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:563851014557:endpoint-config/llava-djl-2024-02-08-22-47-21-804-12xl-config', 'ResponseMetadata': {'RequestId': 'b09d0478-2152-4c5d-8842-3e583ac4a03d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b09d0478-2152-4c5d-8842-3e583ac4a03d', 'content-type': 'application/x-amz-json-1.1', 'content-length': '126', 'date': 'Thu, 08 Feb 2024 22:47:22 GMT'}, 'RetryAttempts': 0}}


In [26]:
# Request creation of the endpoint from the endpoint configuration defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-east-1:563851014557:endpoint/llava-djl-2024-02-08-22-47-21-804-12xl-endpoint


In [27]:
# Save the endpoint name (the ARN segment right after the first "/") to the
# file configured in globals.py.
endpoint_arn = create_endpoint_response["EndpointArn"]
endpoint_name_from_arn = endpoint_arn.split("/")[1]
_ = Path(g.ENDPOINT_FILENAME).write_text(endpoint_name_from_arn)