# Deploy the Owlv2-base-patch16 for inference using Amazon SageMaker AI
**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.

In this notebook, you will learn how to deploy the OWLv2 base (patch16) zero-shot object detection model (HuggingFace model ID: [google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16)) using Amazon SageMaker AI. 

Let's install or upgrade these dependencies using the following command:

In [None]:
%pip install -Uq huggingface==4.49 sagemaker transformers==4.57.0

### Setup

In [1]:
import os
import datetime
import sagemaker
import boto3
import logging
import json
import time
import shutil
import tarfile

import sagemaker
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.session import Session
from sagemaker.s3 import S3Uploader

from huggingface_hub import snapshot_download

print(sagemaker.__version__)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
2.245.0


In [2]:
# Notebook-wide configuration: SageMaker session/role, model identifiers, instance
# settings and the S3 location used for the packaged model artifacts.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Prefer a token supplied through the environment so it does not have to be saved in
# the notebook; the placeholder remains as a fallback for manual replacement.
HUGGING_FACE_HUB_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN", "<REPLACE WITH HUGGINGFACE TOKEN>")

instance_type = "ml.g5.4xlarge"
instance_count = 1

model_id = "google/owlv2-base-patch16"
# Variant of the model id with "/" and "." replaced by "_" (used in the S3 prefix).
model_id_filesafe = model_id.replace("/", "_").replace(".", "_")

# Hyphenated model id plus a timestamp so each run gets a distinct endpoint name.
# The pieces are built with single quotes / a separate variable because reusing the
# enclosing quote character inside an f-string expression only parses on Python 3.12+.
endpoint_suffix = str(datetime.datetime.now().timestamp()).replace(".", "-")
endpoint_name = f"{model_id_filesafe.replace('_', '-')}-endpoint-{endpoint_suffix}"
print(endpoint_name)

base_name = model_id.split('/')[-1].replace('.', '-').lower()
model_lineage = model_id.split('/')[0]

bucket_name = session.default_bucket()
default_prefix = session.default_bucket_prefix or f"models/{model_id_filesafe}"
print(f"Saving model artifacts to {bucket_name}/{default_prefix}")

# Directory that the %%writefile cells below populate with the inference code.
os.makedirs("code", exist_ok=True)

google-owlv2-base-patch16-endpoint-1762196279-931042
Saving model artifacts to sagemaker-us-east-1-329542461890/models/google_owlv2-base-patch16


## Local Model Test

In [3]:
# This code is adapted from https://huggingface.co/google/owlv2-base-patch16

import requests
from PIL import Image
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

# Load the processor and model through the shared `model_id` (and the same token) so
# this local check exercises exactly the checkpoint that is packaged and deployed later.
processor = AutoProcessor.from_pretrained(model_id, token=HUGGING_FACE_HUB_TOKEN)
model = Owlv2ForObjectDetection.from_pretrained(model_id, token=HUGGING_FACE_HUB_TOKEN)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Fail fast with a clear HTTP error instead of handing an error page to PIL,
# and never hang indefinitely on an unresponsive host.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Note: boxes need to be visualized on the padded, unnormalized image
# hence we'll set the target image sizes (height, width) based on that

def get_preprocessed_image(pixel_values):
    """Undo the CLIP mean/std normalization and return the result as a PIL image.

    `pixel_values` is squeezed and converted to NumPy, de-normalized per channel
    (leading axis), scaled to 0-255 as uint8, and re-ordered to channels-last
    before being wrapped in a PIL image.
    """
    clip_std = np.array(OPENAI_CLIP_STD)[:, None, None]
    clip_mean = np.array(OPENAI_CLIP_MEAN)[:, None, None]
    channels_first = (pixel_values.squeeze().numpy() * clip_std) + clip_mean
    as_uint8 = (channels_first * 255).astype(np.uint8)
    channels_last = np.moveaxis(as_uint8, 0, -1)
    return Image.fromarray(channels_last)

unnormalized_image = get_preprocessed_image(inputs.pixel_values)

# PIL reports size as (width, height); reverse it to (height, width) for post-processing.
padded_height_width = unnormalized_image.size[::-1]
target_sizes = torch.Tensor([padded_height_width])
# Turn the raw outputs (bounding boxes and class logits) into final boxes and scores
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.2, target_sizes=target_sizes
)

# Only one image was submitted, so report the predictions at index 0
i = 0
text = texts[i]
first_result = results[i]
boxes = first_result["boxes"]
scores = first_result["scores"]
labels = first_result["labels"]

for box, score, label in zip(boxes, scores, labels):
    box = [round(coordinate, 2) for coordinate in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

2025-11-03 18:58:10.476777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762196290.501584    1280 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762196290.511630    1280 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-03 18:58:10.691100: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fa

Detected a photo of a cat with confidence 0.505 at location [497.9, 32.9, 964.33, 518.96]
Detected a photo of a cat with confidence 0.418 at location [20.35, 80.7, 480.96, 526.21]




## Create SageMaker Model
Here we define the custom requirements and inference logic to be run by this model. We download the model assets from HuggingFace, package them together with the `code/` directory into a gzipped tarball (`model.tar.gz`), and upload it to S3. We then deploy the model as a `HuggingFaceModel`.

In [8]:
%%writefile code/requirements.txt
# Python dependencies for code/inference.py (this file is shipped inside model.tar.gz
# under code/). NOTE(review): presumably installed by the serving container at
# startup - confirm against the container's documentation.
# transformers is pinned to the version installed in the notebook kernel.
transformers==4.57.0
# NOTE(review): torch/torchvision/torchaudio are unpinned; confirm this does not
# replace the container's preinstalled build.
torch
torchvision
torchaudio
pillow
requests

Writing code/requirements.txt


In [9]:
%%writefile code/inference.py
# This code comes from HuggingFace
# https://huggingface.co/google/owlv2-base-patch16

import logging
import requests
from PIL import Image
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def model_fn(model_dir):

    model = Owlv2ForObjectDetection.from_pretrained(
        model_dir,
        device_map="auto"
    )
    
    processor = AutoProcessor.from_pretrained(
        model_dir,
        trust_remote_code=True
    )

    return {"processor": processor, "model": model}


def predict_fn(data, model_obj):
    processor = model_obj["processor"]
    model = model_obj["model"]
    
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    texts = [["a photo of a cat", "a photo of a dog"]]
    inputs = processor(text=texts, images=image, return_tensors="pt")
    
    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Note: boxes need to be visualized on the padded, unnormalized image
    # hence we'll set the target image sizes (height, width) based on that
    
    def get_preprocessed_image(pixel_values):
        pixel_values = pixel_values.squeeze().numpy()
        unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
        unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
        unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
        unnormalized_image = Image.fromarray(unnormalized_image)
        return unnormalized_image
    
    unnormalized_image = get_preprocessed_image(inputs.pixel_values)
    
    target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
    # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=0.2, target_sizes=target_sizes
    )
    
    i = 0  # Retrieve predictions for the first image for the corresponding text queries
    text = texts[i]
    boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
    
    for box, score, label in zip(boxes, scores, labels):
        box = [round(i, 2) for i in box.tolist()]
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

Writing code/inference.py


In [10]:
def filter_function(tarinfo):
    """Tar filter: drop entries whose name contains '.cache' or '.gitattributes'.

    Returns None for excluded entries (tarfile then skips them) and the
    unchanged `tarinfo` for everything else.
    """
    excluded_fragments = ('.cache', '.gitattributes')
    is_excluded = any(fragment in tarinfo.name for fragment in excluded_fragments)
    return None if is_excluded else tarinfo

In [16]:
s3_client = boto3.client('s3')
key = f"{default_prefix}/model.tar.gz"
force_rebuild_tarball = False


def s3_object_exists(client, bucket, object_key):
    """Return True when `object_key` exists in `bucket`, False when it does not.

    `head_object` signals a missing key by raising `ClientError`, not by returning a
    falsy value, so the "not found" error is translated into False here. Any other
    error (for example access denied) is re-raised.
    """
    try:
        client.head_object(Bucket=bucket, Key=object_key)
    except client.exceptions.ClientError as err:
        error_code = err.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey", "NotFound"):
            return False
        raise
    return True


# Skip the download/package/upload round trip when the tarball is already in S3,
# unless a rebuild is explicitly requested.
if force_rebuild_tarball or not s3_object_exists(s3_client, bucket_name, key):
    try:
        model_path = snapshot_download(repo_id=model_id, local_dir="./model", token=HUGGING_FACE_HUB_TOKEN)
    except Exception as e:
        print(f"Failed to download after retries: {str(e)}")
        # Without the snapshot there is nothing to package; stop here instead of
        # failing later with an undefined `model_path`.
        raise
    print(f"Successfully downloaded to {model_path}")

    print("Building gzipped tarball...")
    with tarfile.open("./model.tar.gz", "w:gz") as tar:
        # Model files go at the archive root; the inference code goes under code/.
        tar.add(model_path, arcname=".", filter=filter_function)
        tar.add("./code", filter=filter_function)
    print("Successfully tarred the ball.")

    print(f"Uploading tarball to {bucket_name}/{default_prefix}...")
    s3_client.upload_file("./model.tar.gz", bucket_name, key)
    shutil.rmtree("./model")
    os.remove("./model.tar.gz")
    print("Successfully uploaded, working directory cleaned")

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/620M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Successfully downloaded to /home/sagemaker-user/sagemaker-genai-hosting-examples/01-models/google/Owl/model
Building gzipped tarball...
Successfully tarred the ball.
Uploading tarball to sagemaker-us-east-1-329542461890/models/google_owlv2-base-patch16...
Successfully uploaded, working directory cleaned


## Deploy Model to SageMaker Endpoint

Now we'll deploy our model to a SageMaker endpoint for real-time inference. This is a significant step that:
1. Provisions the specified compute resources (G5 instance)
2. Deploys the model container
3. Sets up the endpoint for API access

### Deployment Configuration
- **Instance Count**: 1 instance for single-node deployment
- **Instance Type**: `ml.g5.4xlarge` for high-performance inference

> ⚠️ **Important**: 
> - Deployment can take up to 15 minutes
> - Monitor the CloudWatch logs for progress

In [18]:
# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID': model_id,
    'HF_TASK': 'zero-shot-object-detection',
    'HF_TOKEN': HUGGING_FACE_HUB_TOKEN
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=f"s3://{bucket_name}/{default_prefix}/model.tar.gz",
    transformers_version='4.49.0',
    pytorch_version='2.6.0',
    py_version='py312',
    env=hub,
    role=role,
    entry_point="inference.py",
    enable_network_isolation=False
)

# deploy model to SageMaker Inference
# Use the instance settings from the configuration cell so the deployment matches the
# G5 instance described above instead of a separately hard-coded instance type.
predictor = huggingface_model.deploy(
    initial_instance_count=instance_count,  # number of instances
    instance_type=instance_type  # ec2 instance type
)

-------!

In [27]:
# Using DJL Serving

# Build the image URI for the session's region rather than hard-coding us-east-1.
# NOTE(review): account 763104351884 is the one used in the original us-east-1 URI;
# confirm it also hosts this image in your region.
region = session.boto_region_name
image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2"

# Each option is listed once; the original dict repeated OPTION_TRUST_REMOTE_CODE.
env = {
    'HF_MODEL_ID': model_id,
    'OPTION_MODEL_NAME': model_id,
    'HF_TASK': 'zero-shot-object-detection',
    'OPTION_TASK': 'zero-shot-object-detection',
    'SM_NUM_GPUS': json.dumps(1),
    'OPTION_TRUST_REMOTE_CODE': 'true',
    'OPTION_MODEL_LOADING_TIMEOUT': '3600',
    "OPTION_ROLLING_BATCH": "disable",
    "OPTION_TENSOR_PARALLEL_DEGREE": "1",
    "OPTION_MAX_MODEL_LEN": "5000",
    "SERVING_FAIL_FAST": "true",
}

model = HuggingFaceModel(
    model_data=f"s3://{bucket_name}/{default_prefix}/model.tar.gz",
    image_uri=image_uri,
    env=env,
    role=role,
    entry_point="inference.py",
    enable_network_isolation=False
)

predictor = model.deploy(
    initial_instance_count=instance_count,
    instance_type=instance_type,
    endpoint_name=endpoint_name
)

--------------!

In [33]:
# predictor.predict(
#     data = {"inputs":"TEST INFERENCE"}
# )

# runtime_client = boto3.client('sagemaker-runtime')
    
# payload = {
#     "inputs": "TEST INFERENCE"
# }
# data = {
#    "inputs": "Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm landline. Delivery within 28 days."
# }
# predictor.predict(data=data)

# response = runtime_client.invoke_endpoint(
#         EndpointName=endpoint_name,
#         ContentType='application/json',
#         Body=json.dumps(payload)
#     )


from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class DJLSerializer(JSONSerializer):
    """JSON serializer that adds a default `model_name` to dict payloads.

    Args:
        model_name: value inserted under the "model_name" key when a dict payload
            does not already carry one.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

    def serialize(self, data):
        """Serialize `data` to JSON, injecting `model_name` into dicts that lack it.

        The payload is copied before the key is added so the caller's dict is
        left untouched.
        """
        if isinstance(data, dict) and 'model_name' not in data:
            data = {**data, 'model_name': self.model_name}
        return super().serialize(data)

# Set the serializer on your predictor
predictor.serializer = DJLSerializer(model_name=model_id)
predictor.deserializer = JSONDeserializer()



import base64
from PIL import Image
import io

# Read the sample image from disk and base64-encode it for the JSON payload
image_path = "./two_cats_on_pink_sheet.jpg"
with open(image_path, "rb") as image_file:
    image_bytes = image_file.read()
image_base64 = base64.b64encode(image_bytes).decode('utf-8')

# Objects you want to detect
candidate_labels = ["person", "car", "dog", "cat"]

# Assemble the request body
data = {
    "model_name": "google/owlv2-base-patch16",  # Required parameter
    "inputs": image_base64,
    "parameters": {"candidate_labels": candidate_labels},
}

response = predictor.predict(data=data)
print(response)


# Clean up

In [26]:
# Tear down everything this notebook deployed so no billable endpoint is left running.
# `predictor` and `model` refer to the most recent deployment.
predictor.delete_endpoint(delete_endpoint_config=True)
model.delete_model()

# The first deploy cell's predictor handle is overwritten by the later deploy, so its
# endpoint is looked up through the model object that created it.
first_model = globals().get("huggingface_model")
first_endpoint_name = getattr(first_model, "endpoint_name", None)
if first_endpoint_name and first_endpoint_name != predictor.endpoint_name:
    sagemaker.Predictor(
        endpoint_name=first_endpoint_name, sagemaker_session=session
    ).delete_endpoint(delete_endpoint_config=True)
    first_model.delete_model()