# VLM VPC 部署 - SageMaker Endpoint (生产推荐)

本notebook在VPC中部署VLM端点，提供企业级安全性。

## VPC部署优势
- 网络隔离和安全控制
- 降低延迟，提升性能
- 满足合规要求

## 前置要求
- VPC、私有子网(至少2个AZ)、安全组
- VPC端点(S3、SageMaker等)
- ml.g6e实例配额

## 1: 环境准备

安装必要的依赖包并初始化SageMaker环境

In [None]:
# Check which of the required packages are already installed
!pip list | grep -E "(boto3|sagemaker|huggingface_hub|transformers)"

In [None]:
# Install any missing dependencies as needed, for example:
!pip install boto3 sagemaker -q

In [None]:
import os
import tarfile
import json
import base64
import boto3, sagemaker
from sagemaker import Model
from datetime import datetime

# Create the SageMaker session and resolve the execution role, region and
# default S3 bucket used by the rest of the notebook.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
# Read the region through the public `boto_region_name` attribute rather than
# the private `_region_name` field, which is an SDK implementation detail.
region = sess.boto_region_name
bucket = sess.default_bucket()

print(f"Region: {region}, Role: {role}")
print(f"Default bucket: {bucket}")

## 2: 参数配置

配置模型路径、实例类型和VPC资源参数 **(请修改VPC配置)**

In [None]:
# Configuration parameters - replace these with your actual values
MODEL_S3_PATH = "s3://your-bucket/models/qwen2-5-vl-7b/"
INSTANCE_TYPE = "ml.g6e.2xlarge"
INITIAL_INSTANCE_COUNT = 2
MODEL_NAME = "qwen2-5-vl-7b"

# VPC configuration - must be changed
VPC_SECURITY_GROUP_IDS = ['sg-xxxxxxxxx']  # your security group ID
VPC_SUBNET_IDS = ['subnet-xxxxxxxx', 'subnet-yyyyyyyy']  # your private subnet IDs

# Auto scaling configuration
MIN_CAPACITY = 2
MAX_CAPACITY = 10
TARGET_INVOCATIONS_PER_INSTANCE = 60  # target invocations per instance per minute
TARGET_GPU_UTILIZATION = 70.0  # target GPU utilization
TARGET_GPU_MEM_UTILIZATION = 85.0  # target GPU memory utilization

print(f"模型: {MODEL_S3_PATH}")
print(f"VPC安全组: {VPC_SECURITY_GROUP_IDS}")
print(f"VPC子网: {VPC_SUBNET_IDS}")

## 3: VPC资源验证

检查安全组和子网配置是否正确

In [None]:
# Validate the VPC resources referenced by the configuration
ec2_client = boto3.client('ec2', region_name=region)

try:
    # Look up the configured security groups; any error falls through to
    # the handler at the bottom of this block.
    sg_response = ec2_client.describe_security_groups(GroupIds=VPC_SECURITY_GROUP_IDS)
    sg_count = len(sg_response['SecurityGroups'])
    print(f"✅ 安全组验证通过: {sg_count}个")

    # Look up the configured subnets and collect their distinct AZs
    subnet_response = ec2_client.describe_subnets(SubnetIds=VPC_SUBNET_IDS)
    azs = set()
    for subnet in subnet_response['Subnets']:
        azs.add(subnet['AvailabilityZone'])
    az_count = len(azs)
    print(f"✅ 子网验证通过: {az_count}个AZ - {azs}")

    # Fewer than two AZs only produces a warning, not a failure
    if az_count < 2:
        print("⚠️  建议使用至少2个不同AZ的子网")

except Exception as e:
    print(f"❌ VPC资源验证失败: {e}")
    print("请检查VPC_SECURITY_GROUP_IDS和VPC_SUBNET_IDS配置")

## 4: 创建LMI配置

生成serving.properties配置文件并打包

In [None]:
# 创建LMI配置
!rm -rf lmi_config
os.makedirs("lmi_config", exist_ok=True)

serving_config = f"""engine=Python
option.model_id={MODEL_S3_PATH}
option.dtype=fp16
option.rolling_batch=vllm
option.tensor_parallel_degree=1
option.device_map=auto
option.max_model_len=4096
option.max_rolling_batch_size=32
option.use_v2_block_manager=true
option.enable_streaming=false
"""

with open("lmi_config/serving.properties", "w") as f:
    f.write(serving_config)

with tarfile.open("lmi_config.tar.gz", "w:gz") as tar:
    tar.add("lmi_config", arcname="lmi_config")

print("✅ LMI配置已创建")

## 5: VPC端点部署

部署端点到私有VPC网络

In [None]:
# Deploy the model to a SageMaker endpoint inside the VPC
image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128"
code_artifact = sess.upload_data("lmi_config.tar.gz", bucket, "large-model-lmi/code")

# VPC settings passed to the model: security groups and subnets
vpc_config = {'SecurityGroupIds': VPC_SECURITY_GROUP_IDS, 'Subnets': VPC_SUBNET_IDS}

# Create the model, then derive a timestamped endpoint name
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
    vpc_config=vpc_config,
)
deploy_timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
endpoint_name = f"vlm-{MODEL_NAME}-vpc-{deploy_timestamp}"

print(f"🚀 开始VPC端点部署: {endpoint_name}")
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type=INSTANCE_TYPE,
    initial_instance_count=INITIAL_INSTANCE_COUNT,
    wait=False,  # do not block until the endpoint is in service
)

print("✅ VPC端点部署已启动，预计15-20分钟")
# Persist the endpoint name to a local file
with open("endpoint_name.save", "w") as name_file:
    name_file.write(endpoint_name)

## 6: 等待端点就绪

监控部署状态直到服务可用

In [None]:
# Wait for the endpoint to become ready
import time
sm_client = boto3.client('sagemaker', region_name=region)

def check_endpoint_status(name):
    """Return the current status string of a SageMaker endpoint.

    Args:
        name: Name of the endpoint to look up.

    Returns:
        The ``EndpointStatus`` value from the ``describe_endpoint`` response,
        or ``"Unknown"`` when the lookup raises an ordinary exception.
    """
    try:
        return sm_client.describe_endpoint(EndpointName=name)['EndpointStatus']
    except Exception:
        # Best-effort polling: lookup errors are reported as "Unknown".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so a polling caller can still be interrupted.
        return "Unknown"

print(f"等待VPC端点就绪: {endpoint_name}")
start_time = time.time()

# Poll every 30 seconds until the endpoint reaches a terminal state
while True:
    status = check_endpoint_status(endpoint_name)
    elapsed = int(time.time() - start_time)
    minutes, seconds = divmod(elapsed, 60)

    # "\r" rewrites the same console line on every poll
    print(f"\r⏱️  {minutes:02d}:{seconds:02d} - 状态: {status}", end="", flush=True)

    if status == "InService":
        print(f"\n\n✅ VPC端点已就绪! 耗时: {minutes}分{seconds}秒")
        break
    if status in ("Failed", "OutOfService"):
        print(f"\n\n❌ 部署失败: {status}")
        break

    time.sleep(30)
    # 40-minute timeout, checked against the elapsed time measured before the sleep
    if elapsed > 2400:
        print("\n\n⚠️  部署超时")
        break

## 7: 配置弹性扩缩容

**仅在端点状态为 InService 时执行此步骤！**

In [None]:
# Create the Application Auto Scaling client
autoscaling_client = boto3.client('application-autoscaling', region_name=region)

# Register the endpoint variant's instance count as a scalable target
print("注册扩缩容目标...")
try:
    scalable_target = {
        'ServiceNamespace': 'sagemaker',
        'ResourceId': f'endpoint/{endpoint_name}/variant/AllTraffic',
        'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
        'MinCapacity': MIN_CAPACITY,
        'MaxCapacity': MAX_CAPACITY,
    }
    autoscaling_client.register_scalable_target(**scalable_target)
    print(f"✅ 扩缩容目标已注册: {MIN_CAPACITY}-{MAX_CAPACITY}个实例")
except Exception as e:
    print(f"❌ 注册扩缩容目标失败: {e}")

In [None]:
# Create the target-tracking scaling policy for the endpoint variant
print("创建扩缩容策略...")
try:
    # Track the predefined per-instance invocation metric
    tracking_config = {
        'TargetValue': TARGET_INVOCATIONS_PER_INSTANCE,
        'PredefinedMetricSpecification': {
            'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance',
        },
        'ScaleOutCooldown': 180,  # 3 minutes: scale out quickly
        'ScaleInCooldown': 300,   # 5 minutes: scale in cautiously
    }
    autoscaling_client.put_scaling_policy(
        PolicyName=f'vlm-scaling-policy-{MODEL_NAME}',
        ServiceNamespace='sagemaker',
        ResourceId=f'endpoint/{endpoint_name}/variant/AllTraffic',
        ScalableDimension='sagemaker:variant:DesiredInstanceCount',
        PolicyType='TargetTrackingScaling',
        TargetTrackingScalingPolicyConfiguration=tracking_config,
    )
    print("✅ 扩缩容策略已创建")
    print("   监控指标: 每实例调用数")
    print(f"   目标阈值: {TARGET_INVOCATIONS_PER_INSTANCE}次/实例/分钟")
    print("   扩容冷却: 3分钟")
    print("   缩容冷却: 5分钟")
except Exception as e:
    print(f"❌ 创建扩缩容策略失败: {e}")

## 8: 推理测试

验证VPC端点功能

In [None]:
# Inference test
smr_client = boto3.client("sagemaker-runtime", region_name=region)

def call_vpc_endpoint(text_prompt, max_tokens=512):
    """Send a text-only chat request to the deployed endpoint.

    Args:
        text_prompt: User message text to send.
        max_tokens: Value sent as ``max_tokens`` in the request body.

    Returns:
        The message content of the first choice in the response, or a string
        starting with "调用失败: " that carries the error text when the
        invocation or the response parsing raises.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": [{"type": "text", "text": text_prompt}]}
    request_body = {
        "messages": [system_message, user_message],
        "temperature": 0.0,
        "max_tokens": max_tokens,
    }

    try:
        raw_body = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType="application/json",
            Body=json.dumps(request_body),
        )["Body"].read()
        first_choice = json.loads(raw_body.decode("utf-8"))["choices"][0]
        return first_choice["message"]["content"]
    except Exception as e:
        # Errors are returned as text rather than raised
        return f"调用失败: {e}"

# Exercise the VPC endpoint with a sample prompt and print the answer
print("测试VPC端点推理...")
test_prompt = "请简单介绍VPC的作用"
print(f"\n问题: {test_prompt}")
result = call_vpc_endpoint(test_prompt)
print(f"回答: {result}")

## 9: 部署总结

显示完整的部署配置信息

In [None]:
# Deployment summary: plain strings where nothing is interpolated,
# f-strings only where a value is substituted.
print("🎉 VLM VPC生产级部署完成!")
print("\n📋 部署信息:")
print(f"   端点名称: {endpoint_name}")
print(f"   实例类型: {INSTANCE_TYPE}")
print(f"   实例数量: {INITIAL_INSTANCE_COUNT} (可扩展至{MAX_CAPACITY})")
print(f"   VPC安全组: {VPC_SECURITY_GROUP_IDS}")
print(f"   VPC子网: {VPC_SUBNET_IDS}")

print("\n🔒 VPC安全特性:")
print("   ✅ 网络隔离 - 私有网络部署")
print("   ✅ 安全组控制 - 精确访问控制")
print("   ✅ 多AZ部署 - 高可用性")

print("\n📊 监控建议:")
print("   - CloudWatch: GPUUtilization, ModelLatency")
print("   - VPC Flow Logs: 网络流量监控")
print("   - 成本监控: 考虑SageMaker Savings Plans")

print("\n🔗 控制台链接:")
print(f"   SageMaker: https://console.aws.amazon.com/sagemaker/home?region={region}#/endpoints/{endpoint_name}")
print(f"   VPC: https://console.aws.amazon.com/vpc/home?region={region}")

print("\n⚠️  生产提醒:")
print("   - 定期审查安全组规则")
print("   - 监控VPC端点使用量")
print("   - 配置适当的备份和恢复策略")