# Llava model preparation

Let's start by installing some dependencies

In [1]:
# Install pinned versions of the SageMaker Python SDK and huggingface_hub;
# --quiet keeps pip's output out of the notebook.
!pip install "sagemaker==2.207.1" "huggingface_hub==0.20.3" --upgrade --quiet

In [None]:
import sagemaker
import boto3
# S3 bucket SageMaker uses for uploading data, models and logs.
# Leave it as None to fall back to the session's default bucket, which
# SageMaker creates automatically if it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used by SageMaker.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() could not determine the role; fall back to
    # looking it up by its well-known name through the IAM API.
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Re-create the session so it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

We create a code folder which will contain our custom inference code, and the requirements.txt for additional dependencies

In [3]:
!mkdir code

Next, we create a requirements.txt file and add the bitsandbytes library to it. The bitsandbytes library is used to quantize the model to 4-bit precision. This library is not available by default in the Hugging Face Inference DLC image.

In [None]:
%%writefile code/requirements.txt
bitsandbytes==0.42.0

To use custom inference code, we need to create an inference.py script. In our example, we are going to override:
- the model_fn to load our llava model correctly 
- the predict_fn to process incoming requests

In the model_fn, we use the LlavaForConditionalGeneration class from transformers to load the model from the local directory (model_dir). We specify device_map="auto" in order to automatically place the model on the available GPUs/CPUs (see this [guide](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) for details). We also enable 4-bit inference (see [this blog](https://huggingface.co/blog/4bit-transformers-bitsandbytes) for details).
In the predict_fn, we use the generate function from transformers to generate text for a given text/image input.

In [None]:
%%writefile code/inference.py
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch
import requests
from PIL import Image

def model_fn(model_dir):
    """Load the LLaVA processor and the 4-bit quantized model from ``model_dir``.

    Returns:
        A ``(processor, model)`` tuple, in that order.
    """
    # Load the weights in 4-bit through bitsandbytes; compute runs in float16.
    four_bit_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

    llava_processor = AutoProcessor.from_pretrained(model_dir)

    # device_map="auto" lets accelerate decide where the model weights live.
    llava_model = LlavaForConditionalGeneration.from_pretrained(
        model_dir,
        device_map="auto",
        quantization_config=four_bit_config,
    )

    return llava_processor, llava_model


def predict_fn(data, processor_and_model):
    """Generate text for a single question/image request.

    Args:
        data: Request payload as a dict. Required keys are ``"question"``
            (the prompt text) and ``"image"`` (a URL the image is fetched
            from). ``"max_new_tokens"`` is optional and defaults to 200.
            The keys are popped, so ``data`` is mutated.
        processor_and_model: The ``(processor, model)`` tuple returned by
            ``model_fn``.

    Returns:
        ``{"output": generated_text}`` where ``generated_text`` is the result
        of ``processor.batch_decode`` on the generated token ids.

    Raises:
        ValueError: If ``data`` is not a dict or a required key is missing.
        requests.HTTPError: If the image URL answers with an error status.
    """
    processor, model = processor_and_model

    # Validate up front: the previous pop(..., data) default silently used the
    # whole payload as prompt/URL and failed later with an unrelated error.
    if not isinstance(data, dict):
        raise ValueError("request payload must be a JSON object with 'question' and 'image' keys")
    missing = [key for key in ("question", "image") if key not in data]
    if missing:
        raise ValueError(f"missing required key(s) in request payload: {', '.join(missing)}")

    # get prompt & parameters
    prompt = data.pop("question")
    image_url = data.pop("image")
    max_new_tokens = data.pop("max_new_tokens", 200)

    # Bound the download time and surface HTTP errors instead of handing an
    # error page to PIL.
    response = requests.get(image_url, stream=True, timeout=30)
    response.raise_for_status()
    image = Image.open(response.raw)

    # The model is placed with device_map="auto", so follow the model's own
    # device rather than assuming "cuda".
    inputs = processor(prompt, images=[image], padding=True, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)

    # create response
    return {"output": generated_text}

We now use the huggingface_hub SDK to easily download the llava-1.5-7b-hf model files from Hugging Face to a model folder

In [None]:
import random
import shutil
from pathlib import Path

from huggingface_hub import snapshot_download

HF_MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# download snapshot
snapshot_dir = snapshot_download(repo_id=HF_MODEL_ID)

# create model dir (random suffix so repeated runs get separate folders)
model_tar = Path(f"model-{random.getrandbits(16)}")
model_tar.mkdir(exist_ok=True)

# copy snapshot to model dir
# shutil.copytree replaces distutils' copy_tree: distutils is deprecated and
# no longer ships with Python 3.12+. dirs_exist_ok=True allows copying into
# the directory created above; with the default symlinks=False the files that
# symlinks point to are copied, not the links themselves.
shutil.copytree(snapshot_dir, model_tar, dirs_exist_ok=True)

We copy our custom files (inference.py and requirements.txt) to the model folder

In [None]:
import shutil

# copy code/ (inference.py and requirements.txt) to <model dir>/code
# shutil.copytree replaces distutils' copy_tree, which is deprecated and no
# longer ships with Python 3.12+; dirs_exist_ok=True keeps the cell re-runnable.
shutil.copytree("code/", model_tar.joinpath("code"), dirs_exist_ok=True)

We create an archive which includes all our files to run inference

In [None]:
import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    """Pack the contents of ``tar_dir`` into a gzip-compressed tar archive.

    Each entry of ``tar_dir`` is stored at the archive root, i.e. the
    directory name itself is not used as a prefix. Every top-level entry
    name is printed as it is added.

    Args:
        tar_dir: Path of the directory whose contents are archived.
        output_file: Path of the archive to write. A relative path is
            resolved against the current working directory.

    Raises:
        ValueError: If ``tar_dir`` is None.
        OSError: If ``tar_dir`` cannot be listed (e.g. it does not exist).
    """
    if tar_dir is None:
        raise ValueError("tar_dir must be the path of the directory to archive")

    # List first so a missing directory fails before an empty archive is created.
    entries = sorted(os.listdir(tar_dir))

    # Add entries by full path with an explicit arcname instead of
    # os.chdir()-ing into tar_dir: the working directory is never changed,
    # so an error cannot leave the process stranded in tar_dir.
    with tarfile.open(output_file, "w:gz") as tar:
        for item in entries:
            print(item)
            tar.add(os.path.join(tar_dir, item), arcname=item)

# Archive the model folder; with the default output_file this produces
# model.tar.gz in the current working directory.
compress(str(model_tar))

Finally, we upload the archive to an Amazon Simple Storage Service (Amazon S3) bucket.

In [None]:
from sagemaker.s3 import S3Uploader

# Destination prefix inside the session's default bucket.
model_s3_prefix = f"s3://{sess.default_bucket()}/llava-hf-15-7b-test1"

# Upload model.tar.gz to S3 and keep the URI that S3Uploader returns.
s3_model_uri = S3Uploader.upload(
    local_path="model.tar.gz",
    desired_s3_uri=model_s3_prefix,
)

# Take note of the s3_model_uri value, this is what the construct will use to deploy the model
print(f"model uploaded to: {s3_model_uri}")
