# Model deployment with Amazon SageMaker AI

In [None]:
import os
import time
import boto3
import json
import sagemaker
from sagemaker.session import Session
from sagemaker.estimator import Estimator
from nb_local_utils.helpers import (
    pretty_print_html,
)

In [None]:
# Start from a default SageMaker session. Leave the bucket as None to use the
# session's default bucket, or assign a bucket name here to override it.
sagemaker_session = Session()
sagemaker_session_bucket = None

if sagemaker_session_bucket is None and sagemaker_session is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the execution role ARN. If get_execution_role() raises ValueError,
# fall back to looking up the IAM role named "sagemaker_execution_role".
try:
    role = sagemaker.session.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Re-create the session so it is explicitly bound to the chosen bucket.
sagemaker_session = Session(default_bucket=sagemaker_session_bucket)
bucket_name = sagemaker_session.default_bucket()
default_prefix = sagemaker_session.default_bucket_prefix

# Low-level boto3 clients pinned to the same region as the session.
sm_client = boto3.client("sagemaker", region_name=sagemaker_session.boto_region_name)
sts = boto3.client("sts", region_name=sagemaker_session.boto_region_name)

In [None]:
# Display the execution role ARN resolved in the setup cell.
pretty_print_html(f"sagemaker role arn: {role}")

In [None]:
# Display the S3 bucket the session was configured with.
pretty_print_html(f"sagemaker bucket: {sagemaker_session_bucket}")

In [None]:
# Display the AWS region the session (and the clients above) operate in.
pretty_print_html(f"sagemaker session region: {sagemaker_session.boto_region_name}")

In [None]:
# Restore `full_training_job_name` from the IPython %store database
# (presumably saved by an earlier notebook in this lab - confirm).
%store -r full_training_job_name
pretty_print_html(f"Your training job name: {full_training_job_name}")

In [None]:
# Re-attach to the existing training job by name so its outputs can be read
# below without re-running training.
estimator_state = Estimator.attach(
    training_job_name=full_training_job_name
)

In [None]:
# Model artifact location reported by the attached estimator.
# NOTE: a later cell indexes this value as a dict
# (["S3DataSource"]["S3Uri"]), so it is expected to be a mapping here.
s3_model_data_uri = estimator_state.model_data
pretty_print_html(f"Fine-tuned model location: {s3_model_data_uri}")

## Prerequisites

In [None]:
# Base model identifier. It is reused for the S3 sub-prefix and, via its last
# path component, for every resource name below.
model_id = "Qwen/Qwen3-VL-2B-Instruct"

# S3 prefix of the `full_model/` directory under the training job's model
# artifacts. S3 URIs always use "/", so the URI is built with plain string
# formatting instead of os.path.join (which uses the host OS separator).
MODEL_S3_URI = f"{s3_model_data_uri['S3DataSource']['S3Uri'].rstrip('/')}/{model_id}/full_model/"

# Resource names: last component of the model id with "." replaced by "-".
# Single quotes are used inside the f-strings so the cell also parses on
# Python versions older than 3.12.
model_name = f"{model_id.split('/')[-1].replace('.', '-')}-vllm"
endpoint_config_name = f"{model_id.split('/')[-1].replace('.', '-')}-config"
endpoint_name = f"{model_id.split('/')[-1].replace('.', '-')}"
ic_name = f"custom-{model_id.split('/')[-1].replace('.', '-')}-vllm"

In [None]:
# Display the four resource names derived from the model id.
pretty_print_html(f"- Model: {model_name}\n- Endpoint Config: {endpoint_config_name}\n- Endpoint: {endpoint_name}\n- Inference Component: {ic_name}")

### Utility functions

Utility functions to check the creation status of endpoints and inference components

In [None]:
import time

In [None]:
def wait_for_endpoint(sm_client, endpoint_name, poll_interval=30, timeout=None):
    """Poll a SageMaker endpoint until it is "InService" or "Failed".

    Args:
        sm_client: Client exposing ``describe_endpoint(EndpointName=...)``.
        endpoint_name: Name of the endpoint to poll.
        poll_interval: Seconds to sleep between polls (default 30).
        timeout: Optional maximum number of seconds to wait. ``None``
            (the default) waits indefinitely.

    Returns:
        The final status string, either "InService" or "Failed".

    Raises:
        TimeoutError: If ``timeout`` is set and elapses before the endpoint
            reaches one of the two final statuses.
    """
    # monotonic() is unaffected by wall-clock adjustments during long waits.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status = sm_client.describe_endpoint(EndpointName=endpoint_name)[
            "EndpointStatus"
        ]
        print(f"Endpoint status: {status}")
        if status in ("InService", "Failed"):
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Endpoint {endpoint_name} still {status} after {timeout} seconds"
            )
        time.sleep(poll_interval)


def wait_for_inference_component(
    sm_client, component_name, poll_interval=30, timeout=None
):
    """Poll an inference component until it is "InService" or "Failed".

    Args:
        sm_client: Client exposing
            ``describe_inference_component(InferenceComponentName=...)``.
        component_name: Name of the inference component to poll.
        poll_interval: Seconds to sleep between polls (default 30).
        timeout: Optional maximum number of seconds to wait. ``None``
            (the default) waits indefinitely.

    Returns:
        The final status string, either "InService" or "Failed".

    Raises:
        TimeoutError: If ``timeout`` is set and elapses before the component
            reaches one of the two final statuses.
    """
    # monotonic() is unaffected by wall-clock adjustments during long waits.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status = sm_client.describe_inference_component(
            InferenceComponentName=component_name
        )["InferenceComponentStatus"]
        print(f"Inference component status: {status}")
        if status in ("InService", "Failed"):
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Inference component {component_name} still {status} "
                f"after {timeout} seconds"
            )
        time.sleep(poll_interval)

***

# Model Deployment

In the following sections, we are going to deploy the fine-tuned model on an Amazon SageMaker Real-time endpoint.

#### Delete Existing SageMaker Endpoint

Let's delete the pre-existing `tuned-Qwen3-VL-2B-Instruct-*` resources that were pre-deployed for this workshop. You need to delete the following:
1. Endpoint (Deployed endpoint consuming a `ml.g5.2xlarge`)
2. Endpoint Config (Endpoint configuration ARN associated with deployed endpoint)
3. Model (Model ARN associated with deployed endpoint)

> [!WARNING]
> Please don't change the values in the cells below.

In [None]:
# Names of the pre-deployed workshop resources that the next cells delete.
# Keep these values exactly as given.
old_tuned_endpoint_name="tuned-Qwen3-VL-2B-Instruct-ep"
old_tuned_endpoint_config_name="tuned-Qwen3-VL-2B-Instruct-ep-config"
old_tuned_model_name="tuned-Qwen3-VL-2B-Instruct"

In [None]:
# Best-effort removal of the pre-deployed endpoint. Only the API call sits in
# the `try`, and only service-side errors are tolerated, so the cell keeps
# working when the endpoint is already gone.
try:
    sm_client.delete_endpoint(EndpointName=old_tuned_endpoint_name)
except sm_client.exceptions.ClientError as e:
    # Show the service error so "already deleted" can be told apart from
    # other failures such as missing permissions.
    print(f"Issue deleting EndpointName, its likely deleted: {old_tuned_endpoint_name} ({e})")
else:
    time.sleep(2)
    pretty_print_html(f"Deleted EndpointName: {old_tuned_endpoint_name}")

In [None]:
# Best-effort removal of the pre-deployed endpoint configuration. Only the API
# call sits in the `try`, and only service-side errors are tolerated, so the
# cell keeps working when the configuration is already gone.
try:
    sm_client.delete_endpoint_config(EndpointConfigName=old_tuned_endpoint_config_name)
except sm_client.exceptions.ClientError as e:
    # Show the service error so "already deleted" can be told apart from
    # other failures such as missing permissions.
    print(f"Issue deleting EndpointConfigName, its likely deleted: {old_tuned_endpoint_config_name} ({e})")
else:
    time.sleep(2)
    pretty_print_html(f"Deleted EndpointConfigName: {old_tuned_endpoint_config_name}")

In [None]:

# Best-effort removal of the pre-deployed model. Only the API call sits in the
# `try`, and only service-side errors are tolerated, so the cell keeps working
# when the model is already gone.
try:
    sm_client.delete_model(ModelName=old_tuned_model_name)
except sm_client.exceptions.ClientError as e:
    # Show the service error so "already deleted" can be told apart from
    # other failures such as missing permissions.
    print(f"Issue deleting ModelName, its likely deleted: {old_tuned_model_name} ({e})")
else:
    time.sleep(2)
    pretty_print_html(f"Deleted ModelName: {old_tuned_model_name}")

#### Waiter for Instance Clean up - **[Mandatory]**

In [None]:
# Endpoint deletion is asynchronous, so a fixed short sleep does not guarantee
# that the old endpoint (and the instance it holds) is gone. Block on the
# boto3 "endpoint_deleted" waiter, which also succeeds when the endpoint no
# longer exists, then keep the original short pause as a buffer.
print("Sleeping for a few seconds for instance free up...")
sm_client.get_waiter("endpoint_deleted").wait(EndpointName=old_tuned_endpoint_name)
time.sleep(5)

#### Inference configurations

In [None]:
# Hosting settings shared by the deployment cells below.
instance_count = 1
instance_type = "ml.g5.2xlarge"
# Also serialized into the container's OPTION_TENSOR_PARALLEL_SIZE setting.
number_of_gpu = 1
# NOTE: despite its name, this value is passed as
# ModelDataDownloadTimeoutInSeconds in the endpoint configuration.
health_check_timeout = 700

#### Create the Endpoint configuration

An Endpoint Configuration in SageMaker defines which model(s), instance type, and scaling settings an endpoint should use when it is created or deployed.

In [None]:
# Create the endpoint configuration with a single "AllTraffic" variant.
# The variant names no model; this notebook attaches the model later through
# an inference component that targets the same variant name.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[
        {
            "VariantName": "AllTraffic",
            "InstanceType": instance_type,
            "InitialInstanceCount": instance_count,
            "ModelDataDownloadTimeoutInSeconds": health_check_timeout,
            "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        }
    ],
)

# Show the raw API response.
pretty_print_html(json.dumps(endpoint_config_response, indent=2))

#### Create endpoint

A SageMaker Endpoint is a fully managed, always-on HTTPS API that hosts your deployed model and serves real-time inference requests.

In [None]:
# Request creation of the endpoint from the configuration defined above.
# The next cell polls until the endpoint reaches a final status.
endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)

# Show the raw API response.
pretty_print_html(json.dumps(endpoint_response, indent=2))

Let's wait for the endpoint to finish being created.

In [None]:
# Block until the endpoint is "InService" or "Failed". The returned status is
# the cell's output; a "Failed" result is returned, not raised, so check it
# before running the following cells.
wait_for_endpoint(sm_client, endpoint_name)

#### Create the Model

In [None]:
# Build the URI of the inference image in this account's ECR registry,
# using the caller's account id and the session's region.
account_id = sts.get_caller_identity()["Account"]
region = sagemaker_session.boto_session.region_name
repo_name = "qwen3-vl-vllm"
tag = "latest"

image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{repo_name}:{tag}"

pretty_print_html(f"Using inference image URI: {image_uri}")

In [None]:
# Environment variables handed to the serving container. json.dumps turns the
# numeric and dict settings into strings.
# NOTE(review): the OPTION_* names are presumably read by the container's
# serving entrypoint - confirm against the image being used.
env = {
    "OPTION_MODEL": "/opt/ml/model/",
    "OPTION_SERVED_MODEL_NAME": "model",
    "OPTION_TENSOR_PARALLEL_SIZE": json.dumps(number_of_gpu),
    "OPTION_DTYPE": "bfloat16",
    "OPTION_MAX_MODEL_LEN": json.dumps(1024 * 16),
    "OPTION_GPU_MEMORY_UTILIZATION": "0.85",
    "OPTION_LIMIT_MM_PER_PROMPT": json.dumps({"image": 5, "video": 0}),
    "OMP_NUM_THREADS": "1",
}

pretty_print_html(f"Model Environment: {json.dumps(env)}")

In [None]:
# Register the SageMaker model: the inference image, its environment, and the
# fine-tuned weights, referenced as an uncompressed S3 prefix.
model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": image_uri,
        "Environment": env,
        "ModelDataSource": {
            "S3DataSource": {
                "S3Uri": MODEL_S3_URI,
                "S3DataType": "S3Prefix",
                "CompressionType": "None",
            }
        },
    },
)
# Show the raw API response.
pretty_print_html(json.dumps(model_response, indent=2))

#### Create inference component

In [None]:
# Attach the model to the endpoint's "AllTraffic" variant as an inference
# component. The accelerator count comes from `number_of_gpu` so it always
# matches the tensor-parallel size configured in the container environment.
ic_response = sm_client.create_inference_component(
    InferenceComponentName=ic_name,
    EndpointName=endpoint_name,
    VariantName="AllTraffic",
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": {
            "MinMemoryRequiredInMb": 12288,
            "NumberOfAcceleratorDevicesRequired": number_of_gpu,
        },
    },
    RuntimeConfig={"CopyCount": 1},
)

# Show the raw API response.
pretty_print_html(json.dumps(ic_response, indent=2))

Let's wait for the inference component to finish being created.

In [None]:
# Block until the inference component is "InService" or "Failed". The returned
# status is the cell's output; a "Failed" result is returned, not raised.
wait_for_inference_component(sm_client, ic_name)

***

## Test endpoint

In [None]:
import io
import json
import boto3
import base64
from io import BytesIO

In [None]:
# Runtime client used to invoke the endpoint. It is pinned to the session's
# region, like the other clients in this notebook, so invocations reach the
# region the endpoint was created in.
sagemaker_client = boto3.client(
    service_name="sagemaker-runtime",
    region_name=sagemaker_session.boto_region_name,
)

### Iterator class for streaming inference

Utility class to parse streaming responses

In [None]:
class LineIterator:
    """Iterate over newline-delimited lines carried by a chunked event stream.

    ``stream`` is any iterable of dict-like events. Events holding a
    ``"PayloadPart"`` entry contribute their ``["PayloadPart"]["Bytes"]`` to an
    internal buffer; other events are skipped. Each iteration yields the next
    complete line as ``bytes`` without its trailing newline. A line may span
    several events. If the stream ends with data that has no trailing newline,
    that remainder is yielded as the last line.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in the buffer not yet returned to the caller.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # Try to read one full line starting at the unread position.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()

            if line.endswith(b"\n"):
                self.read_pos += len(line)
                return line[:-1]

            # No complete line buffered yet: pull the next event.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended mid-line. Return the remainder once;
                    # retrying would loop forever because the exhausted
                    # iterator keeps raising StopIteration.
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                continue

            # Append at the end; the next loop pass re-seeks to read_pos.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

Utility function to parse model answer

In [None]:
def parse_streaming_response(line_str):
    """Extract the text delta from one line of a streamed chat response.

    Args:
        line_str: A decoded line, optionally prefixed with ``"data: "``.

    Returns:
        The ``content`` value of the first choice whose ``delta`` has a
        ``content`` key, or ``None`` when the line is blank, is the
        ``data: [DONE]`` marker, is not valid JSON, or carries no such delta.
    """
    # Strip once so the blank, DONE and prefix checks all see the same text.
    payload = line_str.strip()
    if not payload or payload == "data: [DONE]":
        return None

    prefix = "data: "
    if payload.startswith(prefix):
        payload = payload[len(prefix):]

    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (e.g. a bare number) has no choices.
    if not isinstance(data, dict):
        return None

    choices = data.get("choices")
    if not isinstance(choices, list):
        return None

    for choice in choices:
        delta = choice.get("delta") if isinstance(choice, dict) else None
        if isinstance(delta, dict) and "content" in delta:
            return delta["content"]

    return None

Utility function to convert an image to a base64-encoded string

In [None]:
def pil_to_base64(pil_img, resize_perc=0.5):
    """Scale an image and return it as a base64-encoded PNG string.

    Args:
        pil_img: Image object exposing ``size``, ``resize`` and ``save``.
        resize_perc: Factor applied to every dimension before encoding;
            each scaled dimension is truncated to an int.

    Returns:
        The PNG bytes of the resized image, base64-encoded and decoded as
        UTF-8 text.
    """
    target_size = []
    for side in pil_img.size:
        target_size.append(int(resize_perc * side))
    scaled = pil_img.resize(target_size)

    # Serialize to PNG in memory, then base64-encode the raw bytes.
    png_stream = BytesIO()
    scaled.save(png_stream, format="PNG")
    encoded = base64.b64encode(png_stream.getvalue())
    return encoded.decode("utf-8")

In [None]:
# Text instruction sent with the image. The triple-quoted literal keeps its
# leading and trailing newline characters.
prompt = """
Describe the content of the image
"""

In [None]:
from PIL import Image

# Load the image
img = Image.open("./images/image_1.png")

# A bare expression as the last line makes the notebook render the image.
img

In [None]:
# Encode the image (downscaled by pil_to_base64's default factor) for
# embedding as a data URL.
image_base64 = pil_to_base64(img)

# Chat-style request: one user message holding the image and the text prompt,
# with streaming enabled.
request_body = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ],
    "max_tokens": 4096,
    "temperature": 0.3,
    "top_p": 0.9,
    "stop": ["<|im_end|>"],
    "stream": True,
}

# Invoke the inference component on the endpoint and get a streaming body.
response = sagemaker_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    InferenceComponentName=ic_name,
    Body=json.dumps(request_body),
    ContentType="application/json",
)

# Accumulates the full answer while pieces are echoed as they arrive.
generated_text = ""

for line in LineIterator(response["Body"]):
    if line:
        content = parse_streaming_response(line.decode("utf-8"))
        if content:
            generated_text += content
            # flush so each piece shows up immediately in the cell output
            print(content, end="", flush=True)

***

## Delete resources (Uncomment to Delete)

In [None]:
# import boto3

# sm_client = boto3.client("sagemaker")

# model_id = "Qwen/Qwen3-VL-2B-Instruct"

# model_name = f"{model_id.split('/')[-1].replace('.', '-')}-vllm"
# endpoint_config_name = f"{model_id.split('/')[-1].replace('.', '-')}-config"
# endpoint_name = f"{model_id.split('/')[-1].replace('.', '-')}"
# ic_name = f"custom-{model_id.split('/')[-1].replace('.', '-')}-vllm"

In [None]:
# # Delete inference component
# sm_client.delete_inference_component(InferenceComponentName=ic_name)

In [None]:
# # Delete model
# sm_client.delete_model(ModelName=model_name)

In [None]:
# # Delete endpoint config (optional)
# sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

In [None]:
# # Delete endpoint (optional - if you want to remove the endpoint too)
# sm_client.delete_endpoint(EndpointName=endpoint_name)

---
---
END OF LAB 4
--- 
---
---