In [None]:
# Name passed as `endpoint_name` when the model is deployed further down in this cell
VANILLA_ENDPOINT_NAME: str = "vanilla-qwen3-endpoint"

# Deploy a SageMaker AI endpoint
from sagemaker.utils import name_from_base
from sagemaker.model import Model
from sagemaker import get_execution_role
import json
import boto3


# Resolve the AWS region from the default boto3 session configuration
session = boto3.session.Session()
region_name = session.region_name
if region_name is None:
    # Without this guard the f-string below would quietly embed the text
    # "None" in the image URI and the failure would only surface later.
    raise RuntimeError(
        "No default AWS region is configured; set AWS_DEFAULT_REGION (or "
        "configure a region for the active profile) before running this cell."
    )

# Model configurations
instance_count = 1  # Deploy on a single instance
instance_type = "ml.g5.xlarge"  # 1 GPU instance
# Pinned DJL LMI container image tag, addressed through the ECR registry
# host of the resolved region
image_uri = (
    f"763104351884.dkr.ecr.{region_name}.amazonaws.com/"
    "djl-inference:0.34.0-lmi16.0.0-cu128-v1.2"
)

# Environment variables handed to the serving container
serving_env = {
    # Which model to load
    "HF_MODEL_ID": "Qwen/Qwen3-0.6B",
    # Serving / engine options
    "SERVING_FAIL_FAST": "true",
    "OPTION_ASYNC_MODE": "true",
    "OPTION_ROLLING_BATCH": "disable",
    "OPTION_TENSOR_PARALLEL_DEGREE": "1",
    "OPTION_MAX_MODEL_LEN": json.dumps(12 * 1024),  # serialized as "12288"
    # Timeouts (values copied verbatim)
    "OPTION_MODEL_LOADING_TIMEOUT": "900",
    "SAGEMAKER_MODEL_SERVER_TIMEOUT": "900",
    # Python entrypoint module for the container
    "OPTION_ENTRYPOINT": "djl_python.lmi_vllm.vllm_async_service",
    # Reasoning and tool-call parsing options
    "OPTION_ENABLE_AUTO_TOOL_CHOICE": "true",
    "OPTION_ENABLE_REASONING": "true",
    "OPTION_REASONING_PARSER": "qwen3",
    "OPTION_TOOL_CALL_PARSER": "hermes",
}

# Build the SageMaker Model object: a name derived from the
# "vanilla-qwen3-06b" base, the container image, the notebook's execution
# role, and the environment defined above
model = Model(
    name=name_from_base("vanilla-qwen3-06b"),
    image_uri=image_uri,
    role=get_execution_role(),
    env=serving_env,
)

# Stand up the endpoint for the model - NOTE: Takes 5~7 minutes
model.deploy(
    instance_type=instance_type,
    initial_instance_count=instance_count,
    endpoint_name=VANILLA_ENDPOINT_NAME,
)