In [1]:
import json
import os
import time
from datetime import datetime

import boto3

In [2]:
# Default boto3 session; the next cell reads its configured region.
session = boto3.session.Session()

In [37]:
# Initialize clients
# Resolve the region once; the clients, the ECR image URI and the console URL
# in later cells all reuse this value.
region = session.region_name
if region is None:
    # Fail early with an actionable message instead of letting a missing
    # region leak into the values derived from it.
    raise RuntimeError(
        "No AWS region is configured for this session. Set AWS_DEFAULT_REGION "
        "(or run `aws configure`) and re-run this cell."
    )

# Build every client from the explicit session created above so they share
# its configuration rather than boto3's implicit default session.
sagemaker_client = session.client('sagemaker', region_name=region)
runtime_client = session.client('sagemaker-runtime', region_name=region)
sts_client = session.client('sts', region_name=region)

In [38]:
# AWS account ID of the caller, as reported by STS.
account_id = sts_client.get_caller_identity()['Account']

# One local-time suffix shared by all three resource names, so the model,
# endpoint config and endpoint created by a single run are easy to match up.
timestamp = f'{datetime.now():%Y%m%d-%H%M%S}'
model_name = 'vllm-model-' + timestamp
endpoint_config_name = 'vllm-endpoint-config-' + timestamp
endpoint_name = 'vllm-endpoint-' + timestamp

In [39]:
# Image URI: <account>.dkr.ecr.<region>.amazonaws.com registry host, then the
# repository name and tag.
container_image = (
    f'{account_id}.dkr.ecr.{region}.amazonaws.com'
    '/vllm:0.11.2-sagemaker-v1.2'
)

In [40]:
huggingface_model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
# Read the token from the environment so a real secret never has to be typed
# into (and saved with) the notebook. HF_TOKEN is checked first, then the
# HUGGING_FACE_HUB_TOKEN name this notebook also passes to the container.
# The placeholder is kept only as a fallback so the cell still runs when
# neither variable is set.
huggingface_token = (
    os.environ.get('HF_TOKEN')
    or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    or 'hf_your_token_here'  # Replace with your actual token
)

In [41]:
# Instance type for the endpoint variant (For 8B model).
instance_type = 'ml.g6.4xlarge'
# Role ARN passed to create_model; built from the caller's account ID.
# NOTE(review): assumes a role named SageMakerExecutionRole exists - confirm.
execution_role = 'arn:aws:iam::{}:role/SageMakerExecutionRole'.format(account_id)

In [42]:
# Echo the key settings so they can be checked before any resource is created.
config_summary = {
    "Model Name": model_name,
    "Endpoint Name": endpoint_name,
    "HuggingFace Model": huggingface_model_id,
    "Instance Type": instance_type,
}

print("Configuration:")
for label, value in config_summary.items():
    print(f"  {label}: {value}")

Configuration:
  Model Name: vllm-model-20251126-204805
  Endpoint Name: vllm-endpoint-20251126-204805
  HuggingFace Model: meta-llama/Meta-Llama-3-8B-Instruct
  Instance Type: ml.g6.4xlarge


In [43]:
print(f"\nCreating SageMaker model: {model_name}")

# Environment variables handed to the serving container.
container_environment = {
    'SM_VLLM_MODEL': huggingface_model_id,  # indicate your hf model here
    'HUGGING_FACE_HUB_TOKEN': huggingface_token,  # Required for Llama 3
    'SAGEMAKER_CONTAINER_LOG_LEVEL': 'INFO',
    # Optional vLLM configuration:
    'SM_VLLM_MAX_MODEL_LEN': '2048',
    # 'SM_VLLM_GPU_MEMORY_UTILIZATION': '0.9',
}

# Assemble the request first so the full payload is visible in one place.
model_request = {
    'ModelName': model_name,
    'PrimaryContainer': {
        'Image': container_image,
        'Environment': container_environment,
    },
    'ExecutionRoleArn': execution_role,
    # Uncomment if using public ECR and you have VPC configured:
    # 'VpcConfig': {
    #     'SecurityGroupIds': ['sg-xxxxxxxxx'],  # Your security group
    #     'Subnets': ['subnet-xxxxxxxxx'],       # Your subnet
    # },
}
create_model_response = sagemaker_client.create_model(**model_request)

print("‚úì Model created")
print(f"  Model ARN: {create_model_response['ModelArn']}")



Creating SageMaker model: vllm-model-20251126-204805
‚úì Model created
  Model ARN: arn:aws:sagemaker:us-west-2:875423407011:model/vllm-model-20251126-204805


In [44]:

# =============================================================================
# SECTION 5: Create Endpoint Configuration
# =============================================================================

print(f"\nCreating endpoint configuration: {endpoint_config_name}")

# Single production variant: one instance of the model created above,
# with a variant weight of 1.0.
all_traffic_variant = dict(
    VariantName='AllTraffic',
    ModelName=model_name,
    InstanceType=instance_type,
    InitialInstanceCount=1,
    InitialVariantWeight=1.0,
)

create_endpoint_config_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[all_traffic_variant],
)

print("‚úì Endpoint configuration created")
print(f"  Config ARN: {create_endpoint_config_response['EndpointConfigArn']}")



Creating endpoint configuration: vllm-endpoint-config-20251126-204805
‚úì Endpoint configuration created
  Config ARN: arn:aws:sagemaker:us-west-2:875423407011:endpoint-config/vllm-endpoint-config-20251126-204805


In [45]:
# =============================================================================
# SECTION 6: Create Endpoint (This takes 5-10 minutes)
# =============================================================================

# Console deep link for this endpoint, printed so progress can be followed
# while creation runs.
console_url = (
    "https://console.aws.amazon.com/sagemaker/home"
    f"?region={region}#/endpoints/{endpoint_name}"
)

print(f"\nCreating endpoint: {endpoint_name}")
print("‚è±Ô∏è  This will take approximately 5-10 minutes...")
print(f"\nüí° Monitor progress: {console_url}\n")

# The creation status is polled in the next section.
create_endpoint_response = sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

print("‚úì Endpoint creation initiated")
print(f"  Endpoint ARN: {create_endpoint_response['EndpointArn']}")


Creating endpoint: vllm-endpoint-20251126-204805
‚è±Ô∏è  This will take approximately 5-10 minutes...

üí° Monitor progress: https://console.aws.amazon.com/sagemaker/home?region=us-west-2#/endpoints/vllm-endpoint-20251126-204805

‚úì Endpoint creation initiated
  Endpoint ARN: arn:aws:sagemaker:us-west-2:875423407011:endpoint/vllm-endpoint-20251126-204805


In [47]:
# =============================================================================
# SECTION 7: Wait for Endpoint to be Ready
# =============================================================================

# Polling budget: 60 attempts x 20 seconds = up to 20 minutes.
poll_interval_seconds = 20  # Check every 20 seconds
max_poll_attempts = 60  # Wait up to 20 minutes

print("\nWaiting for endpoint to be in service...")
print("(This may take 5-10 minutes - please be patient)\n")

waiter = sagemaker_client.get_waiter('endpoint_in_service')
waiter.wait(
    EndpointName=endpoint_name,
    WaiterConfig={
        'Delay': poll_interval_seconds,
        'MaxAttempts': max_poll_attempts,
    },
)


Waiting for endpoint to be in service...
(This may take 5-10 minutes - please be patient)



In [48]:
# =============================================================================
# SECTION 8: Make Inference Request
# =============================================================================

section_rule = "=" * 50
print("\n" + section_rule)
print("TESTING INFERENCE")
print(section_rule + "\n")

# Test prompt 1
prompt = "What is the capital of France?"
request_body = dict(
    prompt=prompt,
    max_tokens=100,
    temperature=0.7,
    top_p=0.9,
)

print(f"Prompt: {prompt}")

# Serialize separately so the exact JSON payload sent is easy to inspect.
payload = json.dumps(request_body)
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=payload,
)

# Read the whole response body, then decode it as UTF-8 JSON.
raw_body = response['Body'].read()
response_body = json.loads(raw_body.decode('utf-8'))
print("\nResponse:")
print(json.dumps(response_body, indent=2))


TESTING INFERENCE

Prompt: What is the capital of France?

Response:
{
  "id": "cmpl-4265764d4f5d47d6acb5dc3dd971934b",
  "object": "text_completion",
  "created": 1764193378,
  "model": "meta-llama/Meta-Llama-3-8B-Instruct",
  "choices": [
    {
      "index": 0,
      "text": " A) Paris B) Lyon C) Bordeaux D) Marseille\nThe correct answer is A) Paris. Paris is the capital and most populous city of France, located in the north-central part of the country. It is known for its iconic landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum, as well as its fashion, cuisine, and cultural institutions. Lyon, Bordeaux, and Marseille are all major cities in France, but they are not the capital.",
      "logprobs": null,
      "finish_reason": "length",
      "stop_reason": null,
      "token_ids": null,
      "prompt_logprobs": null,
      "prompt_token_ids": null
    }
  ],
  "service_tier": null,
  "system_fingerprint": null,
  "usage": {
    "prompt_tokens": 8,
   

In [52]:
# =============================================================================
# SECTION 9: Test with Multiple Prompts (Concurrent)
# =============================================================================

section_rule = "=" * 50
print(f"\n{section_rule}")
print("TESTING MULTIPLE PROMPTS (CONCURRENT)")
print(f"{section_rule}\n")

# Thread pool helpers used to fan the prompts out in parallel below.
from concurrent.futures import ThreadPoolExecutor, as_completed

def invoke_prompt(prompt: str, prompt_num: int, max_tokens: int = 150,
                  temperature: float = 0.7):
    """Send one completion request to the endpoint and return the parsed reply.

    Uses the notebook-level ``runtime_client`` and ``endpoint_name``.

    Args:
        prompt: Text placed in the request's "prompt" field.
        prompt_num: Caller-chosen label; printed in the progress messages and
            returned so results can be matched back to their prompt.
        max_tokens: Value sent as "max_tokens" (defaults to the previously
            hard-coded 150).
        temperature: Value sent as "temperature" (defaults to the previously
            hard-coded 0.7).

    Returns:
        Tuple ``(prompt_num, prompt, response_body)`` where ``response_body``
        is the JSON-decoded response payload.
    """
    request_body = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    # Only the first 50 characters are echoed to keep the progress log short.
    print(f"[{prompt_num}] Sending: {prompt[:50]}...")

    response = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=json.dumps(request_body)
    )

    response_body = json.loads(response['Body'].read().decode('utf-8'))
    print(f"[{prompt_num}] ‚úì Received response")

    return prompt_num, prompt, response_body

# Test prompts
test_prompts = [
    "Explain quantum computing in simple terms.",
    "Write a haiku about artificial intelligence.",
    "What are the benefits of using Python for data science?",
    "What is the capital of France?",
    "Tell me a joke about programming."
]

# Run prompts concurrently using ThreadPoolExecutor
start_time = time.time()
results = []
failures = []  # (prompt_num, prompt, exception) for every request that raised

with ThreadPoolExecutor(max_workers=5) as executor:
    # Submit all tasks; map each future back to its prompt for error reporting
    futures = {
        executor.submit(invoke_prompt, prompt, i+1): (i+1, prompt)
        for i, prompt in enumerate(test_prompts)
    }

    # Collect results as they complete
    for future in as_completed(futures):
        try:
            result = future.result()
            results.append(result)
        except Exception as e:
            # Best effort: one failed request must not abort the others,
            # but it is recorded so the summary below stays truthful.
            prompt_num, prompt = futures[future]
            failures.append((prompt_num, prompt, e))
            print(f"[{prompt_num}] ‚ùå Error: {e}")

elapsed = time.time() - start_time

# Sort results by prompt number and print
results.sort(key=lambda x: x[0])

print(f"\n{'='*50}")
print("RESULTS")
print(f"{'='*50}\n")

for prompt_num, prompt, response in results:
    print(f"[{prompt_num}] Prompt: {prompt}")
    print(f"    Response: {json.dumps(response, indent=4)}\n")

# Only claim full success when every prompt actually produced a response.
if failures:
    print(
        f"Completed {len(results)} of {len(test_prompts)} prompts in "
        f"{elapsed:.2f} seconds ({len(failures)} failed)"
    )
else:
    print(f"‚úì All {len(test_prompts)} prompts completed in {elapsed:.2f} seconds")


TESTING MULTIPLE PROMPTS (CONCURRENT)

[1] Sending: Explain quantum computing in simple terms....
[2] Sending: Write a haiku about artificial intelligence....
[3] Sending: What are the benefits of using Python for data sci...
[4] Sending: What is the capital of France?...
[5] Sending: Tell me a joke about programming....
[5] ‚úì Received response
[2] ‚úì Received response
[3] ‚úì Received response
[4] ‚úì Received response
[1] ‚úì Received response

RESULTS

[1] Prompt: Explain quantum computing in simple terms.
    Response: {
    "id": "cmpl-1b9520eca93844498b8400a9a586389c",
    "object": "text_completion",
    "created": 1764193948,
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "choices": [
        {
            "index": 0,
            "text": " (What is it? How does it work? Why is it important?)\nQuantum computing is a new way of processing information that uses the principles of quantum mechanics, which is the study of the behavior of matter and energy at the smallest

In [56]:
# =============================================================================
# SECTION 10: Test Streaming Response
# =============================================================================

section_rule = "=" * 50
print("\n" + section_rule)
print("TESTING STREAMING RESPONSE")
print(section_rule + "\n")

# Streaming request
stream_prompt = "Write a short story about a robot learning to paint."
request_body = dict(
    prompt=stream_prompt,
    max_tokens=300,
    temperature=0.8,
    stream=True,  # Enable streaming
)

print(f"Prompt: {stream_prompt}")
print("\nStreaming response:\n")
print("-" * 50)

# Use the streaming invoke call; the response body is consumed as an event
# stream further down in this cell.
stream_payload = json.dumps(request_body)
response = runtime_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=stream_payload,
)

# Process the streaming response
event_stream = response['Body']
full_response = ""
# Raw bytes of the trailing, not-yet-terminated line. Bytes (not str) are
# buffered because a multi-byte UTF-8 character may be split across two
# PayloadPart events; decoding each part on its own would then fail.
buffer = b""


def _stream_line_text(raw_line):
    """Return the generated text carried by one line of the stream, or ''.

    Blank lines, the ``[DONE]`` marker, lines that are not valid JSON and
    JSON without a non-empty ``choices`` list all yield ''.
    """
    line = raw_line.decode('utf-8', errors='replace').strip()
    # Remove the "data:" prefix if present (SSE format), with or without the
    # optional space after the colon
    if line.startswith('data:'):
        line = line[len('data:'):].strip()
    if not line or line == '[DONE]':
        return ''
    try:
        chunk_data = json.loads(line)
    except json.JSONDecodeError:
        # Best effort: a complete line that is not JSON is ignored
        return ''
    # vLLM uses OpenAI-compatible format
    if isinstance(chunk_data, dict) and chunk_data.get('choices'):
        return chunk_data['choices'][0].get('text', '') or ''
    return ''


def _emit_stream_line(raw_line):
    """Print the text of one stream line as it arrives and return it."""
    text = _stream_line_text(raw_line)
    if text:
        print(text, end='', flush=True)
    return text


for event in event_stream:
    if 'PayloadPart' not in event:
        continue
    buffer += event['PayloadPart']['Bytes']

    # Splitting on the newline byte is safe before decoding: 0x0A never
    # occurs inside a multi-byte UTF-8 sequence.
    # Keep the last incomplete line in buffer
    *complete_lines, buffer = buffer.split(b'\n')
    for raw_line in complete_lines:
        full_response += _emit_stream_line(raw_line)

# Flush a final line that arrived without a trailing newline
if buffer.strip():
    full_response += _emit_stream_line(buffer)
    buffer = b""

print("\n" + "-" * 50)
print(f"\n‚úì Streaming completed! Total length: {len(full_response)} characters")


TESTING STREAMING RESPONSE

Prompt: Write a short story about a robot learning to paint.

Streaming response:

--------------------------------------------------
 - Assignment Example
In this short story, we follow the journey of a robot named Zeta as it learns to paint. Zeta is a cutting-edge robot designed to perform various tasks, but it has never been programmed to create art. One day, its creator, a brilliant scientist named Dr. Rachel, decides to challenge Zeta by teaching it to paint.
Zeta is initially skeptical about the task, but Dr. Rachel is convinced that the robot's precision and attention to detail will make it a natural at painting. She begins by showing Zeta various brushstrokes and techniques, explaining the importance of color, texture, and composition. Zeta listens intently, its digital brain processing the information with lightning speed.
The first few attempts are... ...Show more
The robot's first attempts at painting are met with varying degrees of success. Zeta

In [57]:
# =============================================================================
# SECTION 11: Cleanup - Delete All Resources
# =============================================================================

section_rule = "=" * 50
print("\n" + section_rule)
print("CLEANUP: DELETING RESOURCES")
print(section_rule)
print("\n‚ö†Ô∏è  This will delete the endpoint and stop charges\n")

# Delete endpoint first, then its configuration, then the model.
print(f"Deleting endpoint: {endpoint_name}")
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
print("‚úì Endpoint deletion initiated")

# Wait for deletion to finish before removing the remaining resources
print("Waiting for endpoint to be deleted...")
waiter = sagemaker_client.get_waiter('endpoint_deleted')
waiter.wait(EndpointName=endpoint_name)
print("‚úì Endpoint deleted")

# Delete endpoint configuration
print(f"\nDeleting endpoint configuration: {endpoint_config_name}")
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
print("‚úì Endpoint configuration deleted")

# Delete model
print(f"\nDeleting model: {model_name}")
sagemaker_client.delete_model(ModelName=model_name)
print("‚úì Model deleted")

# Summary
deleted_resources = (
    ("Endpoint", endpoint_name),
    ("Endpoint Config", endpoint_config_name),
    ("Model", model_name),
)
print("\n" + section_rule)
print("CLEANUP COMPLETE")
print(section_rule)
print("All resources deleted:")
for resource_label, resource_name in deleted_resources:
    print(f"  ‚úì {resource_label}: {resource_name}")
print("\n‚úì No ongoing charges!")



CLEANUP: DELETING RESOURCES

‚ö†Ô∏è  This will delete the endpoint and stop charges

Deleting endpoint: vllm-endpoint-20251126-204805
‚úì Endpoint deletion initiated
Waiting for endpoint to be deleted...
‚úì Endpoint deleted

Deleting endpoint configuration: vllm-endpoint-config-20251126-204805
‚úì Endpoint configuration deleted

Deleting model: vllm-model-20251126-204805
‚úì Model deleted

CLEANUP COMPLETE
All resources deleted:
  ‚úì Endpoint: vllm-endpoint-20251126-204805
  ‚úì Endpoint Config: vllm-endpoint-config-20251126-204805
  ‚úì Model: vllm-model-20251126-204805

‚úì No ongoing charges!
