# SageMaker Example

## 2. Build the container

The demo code is in `app/`.
Build and push the Docker image with the following command:

In [None]:
# Run the build_and_push_sglang.sh script with bash (IPython shell escape).
!bash build_and_push_sglang.sh

## 3. Deploy on SageMaker

Define the model and deploy it on SageMaker.


### 3.1 Init SageMaker session

In [None]:
# !pip install boto3 sagemaker transformers
import re
import json
import os,dotenv
import boto3
import sagemaker
from sagemaker import Model


# Load variables from a local .env file into the process environment.
dotenv.load_dotenv()
# NOTE: do not print os.environ here. It can hold AWS credentials and other
# secrets, and anything printed is stored in the notebook's saved output.

# All SageMaker calls made through `sess` go to us-east-1.
boto_sess = boto3.Session(region_name='us-east-1')
sess = sagemaker.session.Session(boto_session=boto_sess)

# The execution role is read from the `role` environment variable.
# role = sagemaker.get_execution_role()
role = os.environ.get('role')
if not role:
    # Surface the misconfiguration here instead of as an opaque error at deploy time.
    print("⚠️  Environment variable 'role' is not set; add it to .env before deploying")

### 3.2 Prepare model file

#### Option 2: Deploy vLLM by model ID

In [None]:
# Pack the model_tar/ directory into a gzip-compressed archive named model.tar.gz
# (c = create, z = gzip, v = list files as they are added, f = output file).
!tar czvf model.tar.gz model_tar/

In [None]:


# Upload the local model.tar.gz to the session's default bucket under the
# prefix below and keep the returned S3 URI for the deployment cells.
s3_code_prefix = "sagemaker_endpoint/sglang/"
bucket = sess.default_bucket()
code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

### 3.3 Deploy model

### vllm

In [None]:
import boto3
from sagemaker.model import Model
from sagemaker import get_execution_role
import time

# Initialization
# role = get_execution_role()  # or specify a concrete role ARN
# Build the clients from the shared boto3 session so that status polling and
# log retrieval hit the same region (us-east-1) the endpoint is deployed to.
sm_client = boto_sess.client('sagemaker')
logs_client = boto_sess.client('logs')

# Configuration
endpoint_name = 'qwen-vl-endpoint-test'
image_uri = "434444145045.dkr.ecr.us-east-1.amazonaws.com/sagemaker_endpoint/vllm:v0.12.0"
poll_interval_seconds = 30
max_wait_seconds = 60 * 60  # stop polling after one hour instead of looping forever

print(f"Using role: {role}")
print(f"Image URI: {image_uri}")

try:
    # Create the model
    model = Model(
        image_uri=image_uri,
        role=role,
        # Use the same SageMaker session as the other deployment cells.
        sagemaker_session=sess,
        env={
            'HF_MODEL_ID': 'Qwen/Qwen3-VL-2B-Thinking',
            'MAX_MODEL_LEN': '4096',
            'MAX_NUM_SEQS': '256',
            'TENSOR_PARALLEL_SIZE': '1',
            'ENABLE_PREFIX_CACHING': '1',
            'VLLM_ALLOW_LONG_MAX_MODEL_LEN': '1',
            'DTYPE': 'auto',
            # Extra environment variables for debugging
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',  # INFO
            'SAGEMAKER_REGION': 'us-east-1'
        },
        name=f'qwen-vl-model-{int(time.time())}'
    )

    print("✅ Model object created")

    # Deploy
    print("Starting deployment...")
    predictor = model.deploy(
        initial_instance_count=1,
        instance_type='ml.g5.2xlarge',
        endpoint_name=endpoint_name,
        container_startup_health_check_timeout=900,
        wait=False  # do not block; the status is polled manually below
    )

    print(f"✅ Deployment initiated for endpoint: {endpoint_name}")

    # Monitor the deployment status until it succeeds, fails, or times out
    print("\nMonitoring deployment status...")
    deadline = time.monotonic() + max_wait_seconds
    while True:
        response = sm_client.describe_endpoint(EndpointName=endpoint_name)
        status = response['EndpointStatus']
        print(f"Status: {status}")

        if status == 'InService':
            print("✅ Endpoint is in service!")
            break
        elif status == 'Failed':
            print("❌ Deployment failed!")
            print(f"Failure reason: {response.get('FailureReason', 'Unknown')}")
            break
        elif time.monotonic() >= deadline:
            # Any other status that never resolves would otherwise spin forever.
            print(f"⚠️  Endpoint still '{status}' after {max_wait_seconds}s; stopping the status check")
            break

        time.sleep(poll_interval_seconds)

    # Check the logs
    print("\nChecking CloudWatch logs...")
    log_group = f'/aws/sagemaker/Endpoints/{endpoint_name}'
    try:
        # Most recently active streams first, at most five of them.
        streams = logs_client.describe_log_streams(
            logGroupName=log_group,
            orderBy='LastEventTime',
            descending=True,
            limit=5
        )
        print(f"Found {len(streams['logStreams'])} log streams")

        for stream in streams['logStreams']:
            print(f"\n--- Log Stream: {stream['logStreamName']} ---")
            events = logs_client.get_log_events(
                logGroupName=log_group,
                logStreamName=stream['logStreamName'],
                limit=50
            )
            for event in events['events']:
                print(event['message'])

    except logs_client.exceptions.ResourceNotFoundException:
        print("⚠️  No CloudWatch logs found yet")
        print("This usually means the container failed to start")

except Exception as e:
    # Keep the notebook running but show the full traceback for diagnosis.
    print(f"❌ Error during deployment: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Container image and Hugging Face model id used for this deployment.
CONTAINER = '434444145045.dkr.ecr.us-east-1.amazonaws.com/sagemaker_endpoint/vllm:v0.12.0'
model_name = "Qwen/Qwen3-VL-2B-Thinking"

# Environment variables passed to the container through the Model definition.
env = dict(
    HF_MODEL_ID=model_name,
    DTYPE="auto",
    VLLM_ALLOW_LONG_MAX_MODEL_LEN="1",
    MAX_MODEL_LEN="12288",
    ENABLE_PREFIX_CACHING="1",
    TENSOR_PARALLEL_SIZE="1",
    MAX_NUM_SEQS='256',
    ENFORCE_EAGER="0",
)

model = Model(
    name=f'{sagemaker.utils.name_from_base("sagemaker-vllm")}-model',
    model_data=code_artifact,
    image_uri=CONTAINER,
    role=role,
    sagemaker_session=sess,
    env=env,
)

# Deploy the model to an endpoint
endpoint_name = f'{sagemaker.utils.name_from_base("sagemaker-vllm")}-endpoint'
print(f"endpoint_name: {endpoint_name}")
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.2xlarge',
    endpoint_name=endpoint_name,
)

### SGLANG

In [None]:
from sagemaker.enums import EndpointType
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements
from sagemaker import Predictor
from sagemaker import Model


# SGLang container image and the S3 location of the fine-tuned model weights.
CONTAINER = '434444145045.dkr.ecr.us-east-1.amazonaws.com/sagemaker_endpoint/sglang:v0.5.6.post1-cu129-amd64'
model_path = "s3://sagemaker-us-east-1-434444145045/Qwen2-5-3B-Instruct/032650faedac452e86f95f3f3b004342/finetuned_model/"

# Environment variables passed to the container through the Model definition.
# Alternative left disabled by the author: set "HF_MODEL_ID" to a model id
# such as 'Qwen/Qwen2-1.5B-Instruct' instead of "S3_MODEL_PATH".
env = dict(S3_MODEL_PATH=model_path)

model_name = f'{sagemaker.utils.name_from_base("sagemaker-sglang")}-model'

model = Model(
    name=model_name,
    model_data=code_artifact,
    image_uri=CONTAINER,
    role=role,
    sagemaker_session=sess,
    env=env,
    predictor_cls=Predictor,
)

# Deploy the model to an endpoint
endpoint_name = f'{sagemaker.utils.name_from_base("sagemaker-sglang")}-endpoint'
print(f"endpoint_name: {endpoint_name}")
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.2xlarge',
    endpoint_name=endpoint_name,
    model_name=model_name,
)

## 4. Test

You can invoke the deployed endpoint with the boto3 SageMaker Runtime client.

### 4.1 Messages API: non-streaming mode

In [None]:
# Runtime client used to invoke the endpoint (also reused by the streaming cell).
runtime = boto3.client('runtime.sagemaker', region_name='us-east-1')

# Single-turn chat request with streaming disabled.
payload = {
    "messages": [{"role": "user", "content": "who are you"}],
    "model": "qwen",
    "max_tokens": 1024,
    "stream": False,
}
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(payload),
)

# Read the whole response body, parse it as JSON and print the first choice's text.
completion = json.loads(response['Body'].read())
print(completion["choices"][0]["message"]["content"])

### 4.2 Messages API: streaming mode

In [None]:
import codecs

# Single-turn chat request with streaming enabled.
payload = {
    "messages": [
    {
        "role": "user",
        "content": "Write a quick sort in python"
    }
    ],
    "model":"custome",
    "max_tokens": 4096,
    "stream": True
}

response = runtime.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

# The body is consumed as "data: <json>" events separated by a blank line.
# A network chunk may carry several events, or only part of one, and may even
# end in the middle of a multi-byte UTF-8 character. Therefore:
#   * bytes are decoded incrementally, so split characters do not raise;
#   * every complete event in the buffer is handled on each iteration, and only
#     the trailing incomplete piece is kept for the next chunk.
decoder = codecs.getincrementaldecoder("utf-8")()
buffer = ""
for part in response['Body']:
    buffer += decoder.decode(part["PayloadPart"]["Bytes"])
    # Everything before the last "\n\n" is a complete event; the remainder
    # (possibly empty) stays in the buffer.
    *events, buffer = buffer.split("\n\n")
    for event in events:
        event = event.strip()
        if not event.startswith("data:"):
            continue
        data_str = event[len("data:"):].strip()
        if data_str == "[DONE]":
            # Non-JSON end-of-stream marker.
            continue
        try:
            content = json.loads(data_str)["choices"][0]["delta"].get("content")
        except (json.JSONDecodeError, KeyError, IndexError, AttributeError):
            # Skip events that are not chat-completion deltas.
            continue
        if content:
            # Deltas without text (missing or null content) are not printed.
            print(content, end="")