<div style="background-color: #FFDDDD; border-left: 5px solid red; padding: 10px; color: black;">
    <strong>Kernel:</strong> Python 3 (ipykernel)
</div>

# 🚀 Deploy `Qwen/Qwen3-4B-Instruct-2507` on Amazon SageMaker

## Prerequisites

To start off, let's install some packages to help us through the notebooks. **Restart the kernel after packages have been installed.**

In [None]:
# Install (or upgrade) the notebook's dependencies into the running kernel's environment.
%pip install -r ./scripts/requirements.txt --upgrade

## This cell will restart the kernel. Click "OK".

In [None]:
from IPython import get_ipython

# Restart the kernel so the packages installed above are the ones that get imported.
ipython_kernel = get_ipython().kernel
ipython_kernel.do_shutdown(True)

***

In [None]:
import os
import sagemaker
import boto3
import shutil
from sagemaker.config import load_sagemaker_config
import sys
sys.path.append(os.path.dirname(os.getcwd()))

from utilities.helpers import (
    pretty_print_html, 
    set_meta_llama_params,
    print_dialog,
    format_messages,
    write_eula
)

# A single SageMaker session is shared by every cell below so that the region,
# default bucket and credentials all come from the same place.
sagemaker_session = sagemaker.Session()
s3_client = boto3.client('s3')

region = sagemaker_session.boto_session.region_name
bucket_name = sagemaker_session.default_bucket()
default_prefix = sagemaker_session.default_bucket_prefix
configs = load_sagemaker_config()

# `session` is kept as an alias for cells that use the shorter name; it reuses
# the existing session instead of constructing a second, independent one.
session = sagemaker_session
# Resolve the execution role through the same session rather than letting the
# SDK build yet another default session internally.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)


print(f"Execution Role: {role}")
print(f"Default S3 Bucket: {bucket_name}")

## Deploy Model to SageMaker Hosting

### Step 1: Get the SageMaker LMI container to host Qwen3

In [None]:
# LMI 0.33.0 is not yet available through the SageMaker SDK, so the container
# URI is pinned by hand for now. Once the SDK can resolve it, replace the
# hard-coded URI with:
#
#     inference_image_uri = sagemaker.image_uris.retrieve(
#         framework="djl-lmi",
#         region=session.boto_session.region_name,
#         version="0.33.0"
#     )
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128"
)
pretty_print_html(f"using image to host: {inference_image_uri}")

### Step 2: Deploy the model using `sagemaker.Model`

In [None]:
# Hugging Face Hub repository id of the model to host.
model_id = "Qwen/Qwen3-4B-Instruct-2507"
# Repo ids contain "/", so swap it for "_" to get a name usable in local and S3 paths.
model_id_filesafe = "_".join(model_id.split("/"))

# True:  download the model in this notebook and upload it to S3; the endpoint is
#        then pointed at the S3 copy.
# False: pass the Hugging Face repo id through unchanged as the model location.
use_local_model = True

In [None]:
from huggingface_hub import snapshot_download
from sagemaker.s3 import S3Uploader
import os
import subprocess

if use_local_model:
    # Stage 1: pull the model snapshot from the Hugging Face Hub to local disk.
    model_local_location = f"../models/{model_id_filesafe}"
    print("Downloading model ", model_id)
    os.makedirs(model_local_location, exist_ok=True)
    snapshot_download(repo_id=model_id, local_dir=model_local_location)
    print(f"Model {model_id} downloaded under {model_local_location}")

    # Stage 2: work out the S3 destination, honouring the session's default
    # bucket prefix when one is configured.
    s3_key_prefix = f"{default_prefix}/models" if default_prefix else "models"
    model_s3_destination = f"s3://{bucket_name}/{s3_key_prefix}/{model_id_filesafe}"

    print("Beginning Model Upload...")

    # Stage 3: copy the files with the AWS CLI. The download cache directory and
    # .gitattributes are excluded from the upload.
    # check=True makes a failed copy raise CalledProcessError here, instead of
    # silently continuing and reporting a successful upload for an empty prefix.
    subprocess.run(
        [
            'aws', 's3', 'cp', model_local_location, model_s3_destination,
            '--recursive',
            '--exclude', '.cache/*',
            '--exclude', '.gitattributes',
        ],
        check=True,
    )

    print(f"Model Uploaded to: \n {model_s3_destination}")

    # The model location is handed to later cells through an environment variable.
    os.environ["model_location"] = model_s3_destination
else:
    # No local copy: later cells receive the Hugging Face repo id as the location.
    os.environ["model_location"] = model_id

In [None]:
# Settings handed to the serving container as environment variables (they are
# passed as `env` when the SageMaker model object is created). All values are
# strings.
inference_llm_config = {
    # Where to load the model from: the location chosen in the previous cell.
    "HF_MODEL_ID": os.environ["model_location"],
    "OPTION_MAX_MODEL_LEN": "4096",
    "OPTION_GPU_MEMORY_UTILIZATION": "0.8",
    "OPTION_ENABLE_STREAMING": "false",
    "OPTION_ROLLING_BATCH": "vllm",
    "OPTION_MODEL_LOADING_TIMEOUT": "3600",
    "OPTION_PAGED_ATTENTION": "false",
    "OPTION_TRUST_REMOTE_CODE": "true",
    "OPTION_DTYPE": "bf16",
    "OPTION_QUANTIZE": "fp8",
    "OPTION_TENSOR_PARALLEL_DEGREE": "max",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
}

In [None]:
model_name = "Qwen3-4B-Instruct-2507"

# Describe the SageMaker model: the serving container image, the environment
# configuration built above, and the IAM role SageMaker runs it under.
lmi_model = sagemaker.Model(
    name=model_name,
    role=role,
    image_uri=inference_image_uri,
    env=inference_llm_config,
)

In [None]:
from sagemaker.utils import name_from_base

# `endpoint_name` is the human-readable base; `name_from_base` derives the
# actual endpoint name from it, which is what gets deployed and stored later.
endpoint_name = f"{model_name}-endpoint"
BASE_ENDPOINT_NAME = name_from_base(endpoint_name)

# Create the real-time endpoint on a single GPU instance.
predictor = lmi_model.deploy(
    endpoint_name=BASE_ENDPOINT_NAME,
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
)

In [None]:
# System prompt, written as explicit line pieces so the trailing spaces and
# newlines that are part of the prompt text are visible.
SYSTEM_PROMPT = (
    "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n"
    "Below is an instruction that describes a task, paired with an input that provides further context. \n"
    "Write a response that appropriately completes the request.\n"
    "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response."
)

USER_PROMPT = "A 3-week-old child has been diagnosed with late onset perinatal meningitis, and the CSF culture shows gram-positive bacilli. What characteristic of this bacterium can specifically differentiate it from other bacterial agents?"

# Chat-style conversation: one system turn followed by one user turn.
messages = [
    {"role": speaker, "content": text}
    for speaker, text in (("system", SYSTEM_PROMPT), ("user", USER_PROMPT))
]

messages

In [None]:
# Build a predictor bound to the deployed endpoint that sends and receives JSON.
# It is created explicitly here so this cell also works against an endpoint
# that was deployed in an earlier session.
predictor = sagemaker.Predictor(
    endpoint_name=BASE_ENDPOINT_NAME,
    sagemaker_session=sagemaker_session,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

# The payload uses the chat-completions schema ("messages"). In that schema the
# sampling settings are top-level fields and the token limit is "max_tokens";
# a nested TGI-style "parameters" object ("max_new_tokens", "return_full_text")
# belongs to the "inputs" schema and is not applied to chat requests.
response = predictor.predict({
    "messages": messages,
    "temperature": 0.2,
    "top_p": 0.9,
    "max_tokens": 1024,
})

# Show the assistant's reply text from the first returned choice.
response["choices"][0]["message"]["content"]

### Store variables

Save the endpoint name so it can be reused later.

In [None]:
# Persist the endpoint name with IPython's %store magic; restore it elsewhere with `%store -r BASE_ENDPOINT_NAME`.
%store BASE_ENDPOINT_NAME