# Deploying qwen3-embedding-0-6b Embedding Model on Amazon SageMaker

This notebook demonstrates how to deploy the [Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B) embedding model on Amazon SageMaker. Qwen3-Embedding-0.6B is a text embedding model that produces dense embeddings; the endpoint built here also supports instruction-prefixed queries and Matryoshka-style (reduced-dimension) embeddings.

## Steps:
1. Download model checkpoint from Hugging Face
2. Upload model to S3
3. Create custom inference code
4. Deploy model to SageMaker endpoint
5. Test the endpoint

## 1. Download Model Checkpoint

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository that holds the checkpoint
model_name = "Qwen/Qwen3-Embedding-0.6B"

# Local directory used as the Hugging Face cache for the download
local_model_path = Path("./hf_model")
local_model_path.mkdir(exist_ok=True)

# Fetch every file of the repository into the cache directory
snapshot_download(cache_dir=local_model_path, repo_id=model_name)

## 2. Upload Model to S3

In [None]:
import sagemaker
import boto3
import json

# Initialize SageMaker session and clients
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

region = sess._region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

# Define S3 paths
s3_model_prefix = "model/Qwen/Qwen3-Embedding-0.6B"  # folder where model checkpoint will go
model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]
s3_code_prefix = "inference_code/Qwen/Qwen3-Embedding-0.6B"
print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

# Upload model to S3
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

## 3. Create Custom Inference Code

In [None]:
# Create the directory that will hold the custom inference code (no error if it already exists)
!mkdir -p inference_code

In [None]:
%%writefile inference_code/model.py
import logging

# Input/Output are the DJL Serving request/response wrappers used by handle()
from djl_python import Input, Output
from vllm import LLM, PoolingParams


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed prompt for a single query.

    When ``task_description`` is empty or ``None``, a generic web-search
    retrieval instruction is used instead.
    """
    default_task = (
        "Given a web search query, retrieve relevant passages that answer the query"
    )
    task = task_description or default_task
    return f"Instruct: {task}\nQuery:{query}"


def load_model(properties):
    """Create the vLLM embedding engine for this deployment.

    The weights location is ``properties["model_id"]`` when that key is
    present; otherwise ``properties["model_dir"]``, falling back to
    ``/opt/ml/model``.
    """
    if "model_id" in properties:
        weights_location = properties["model_id"]
    else:
        weights_location = properties.get("model_dir", "/opt/ml/model")

    logging.info(f"Loading model from {weights_location}")

    # task="embed" selects the embedding task; the hf_overrides entry marks
    # the model as matryoshka-capable.
    engine_kwargs = {
        "model": weights_location,
        "task": "embed",
        "hf_overrides": {"is_matryoshka": True},
    }
    return LLM(**engine_kwargs)


# Lazily created vLLM engine; populated on the first call to handle()
model = None


def handle(inputs: Input):
    """DJL Serving entry point: embed the sentences of a JSON request.

    Request body (JSON object):
        inputs: a string or a list of strings to embed.
        is_query: optional; when truthy, every input is wrapped with
            get_detailed_instruct(). Defaults to False.
        instruction: optional custom task instruction used for queries.
        dim: optional Matryoshka output dimension. When omitted, null or
            non-positive, no pooling parameters are passed to the model.

    Returns:
        An Output whose JSON body is {"dense_embeddings": [...]}, or None
        when the request is empty.
    """
    global model
    if model is None:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None

    data = inputs.get_as_json()

    input_sentences = data.get("inputs", [])
    if isinstance(input_sentences, str):
        input_sentences = [input_sentences]

    # Parameters from the user's request
    is_query = data.get("is_query", False)
    instruction = data.get(
        "instruction"
    )  # Can be None, get_detailed_instruct will use a default.

    # An explicit JSON null would otherwise reach `dim > 0` as None and raise
    # TypeError, so treat it exactly like a missing key.
    dim = data.get("dim")
    if dim is None:
        dim = -1

    logging.info(f"inputs: {len(input_sentences)} sentences")
    logging.info(f"is_query: {is_query}")
    if instruction:
        logging.info(f"custom instruction: {instruction}")
    logging.info(f"embedding dimension: {dim if dim > 0 else 'default'}")

    if is_query:
        # For queries, add instructions.
        input_texts = [get_detailed_instruct(instruction, q) for q in input_sentences]
    else:
        # For documents, no instruction is needed.
        input_texts = input_sentences

    pooling_params = None
    if dim > 0:
        logging.info(f"Using matryoshka embeddings with dimension: {dim}")
        pooling_params = PoolingParams(dimensions=dim)

    # Get embeddings using model.embed
    logging.info("Calling model.embed on vLLM...")
    outputs = model.embed(input_texts, pooling_params=pooling_params)
    logging.info("model.embed call finished.")

    # Extract embeddings from vLLM output
    embeddings = [o.outputs.embedding for o in outputs]
    logging.info(f"Extracted {len(embeddings)} embeddings.")

    # Format output
    result = {"dense_embeddings": embeddings}
    logging.info("Formatted result into a dictionary.")

    output_obj = Output().add_as_json(result)
    logging.info("Created DJL Output object. Returning from handle function.")

    return output_obj



In [None]:
import os

code_dir = "inference_code"
if not os.path.exists(code_dir):
    os.mkdir(code_dir)

# Assemble the serving.properties contents first, then write them in one call.
# Lines are newline-separated with no trailing newline.
serving_properties = "\n".join(
    [
        "engine=Python",
        "option.rolling_batch=disable",
        f"option.model_id=s3://{bucket}/{s3_model_prefix}/",
    ]
)
with open('inference_code/serving.properties', 'w') as f:
    f.write(serving_properties)

In [None]:
%%writefile inference_code/requirements.txt
vllm

In [None]:
# Package and upload inference code
!rm -f inference_code.tar.gz
!cd inference_code && rm -rf ".ipynb_checkpoints"
!tar czvf inference_code.tar.gz inference_code

s3_code_artifact = sess.upload_data("inference_code.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

## 4. Deploy Model to SageMaker Endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# DJL inference container image, resolved in the ECR registry of the session's region
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128-v1.3"

# name_from_base derives the SageMaker model name from this base string
model_name = name_from_base("qwen3-embedding-0-6b")

print(f"Model name: {model_name}")
print(f"Inference container image: {inference_image_uri}")

In [None]:
# --- Model: pair the container image with the packaged inference code ---
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]
print(f"Created Model: {model_arn}")

# --- Endpoint config: a single variant on one ml.g5.xlarge instance ---
endpoint_config_name = f"{model_name}-config"
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.xlarge",
    "InitialInstanceCount": 1,
    "ContainerStartupHealthCheckTimeoutInSeconds": 5 * 60,
}
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
print(endpoint_config_response)

# --- Endpoint: start the deployment (the call returns before it is in service) ---
endpoint_name = f"{model_name}-endpoint"
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

In [None]:
# Wait for endpoint deployment to complete
import time

# Poll once per minute until the endpoint leaves the "Creating" state
while True:
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here on a failed deployment instead of letting the test cells below
# fail later with a less helpful invocation error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

## 5. Test the Endpoint

In [None]:
import boto3
import json
import numpy as np

smr_client = boto3.client("sagemaker-runtime")

# Reuse the endpoint created in section 4 when it exists in this kernel: its name
# carries the suffix added by name_from_base, so a fixed literal would not match it.
# The literal is only a placeholder for running this section on its own; replace it
# with the name of an existing endpoint in that case.
endpoint_name = globals().get("endpoint_name", "qwen3-embedding-0-6b-endpoint")

def get_embeddings(texts, is_query=False, dim=-1, instruction=None):
    """Invoke the SageMaker endpoint and return its parsed JSON response.

    Args:
        texts: a string or list of strings to embed.
        is_query: set True for queries so the endpoint adds the task
            instruction; leave False for documents.
        dim: desired Matryoshka embedding dimension. It is only sent when
            positive; ``None`` or a non-positive value requests the default.
        instruction: optional custom task instruction for queries. It is only
            sent when non-empty; otherwise the endpoint uses its built-in one.

    Returns:
        The decoded response dict, which holds the embeddings under the
        key "dense_embeddings".
    """
    payload = {
        "inputs": texts,
        "is_query": is_query,
    }
    if dim is not None and dim > 0:
        payload["dim"] = dim
    if instruction:
        payload["instruction"] = instruction

    response = smr_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    # The response body is a stream; read and decode it before parsing the JSON
    json_str = response['Body'].read().decode('utf8')
    return json.loads(json_str)


In [None]:
# --- Example Usage (Default Dimension) ---

# Sample queries and the documents they are compared against
queries = ["What is the capital of China?", "Explain gravity"]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

print("--- Default Dimension Example ---")

# Queries are sent with is_query=True so the endpoint adds the task instruction
print("1. Embedding queries...")
query_response = get_embeddings(queries, is_query=True)
query_embeddings = query_response["dense_embeddings"]

# Documents are embedded as-is
print("2. Embedding documents...")
doc_response = get_embeddings(documents)
doc_embeddings = doc_response["dense_embeddings"]

# Similarity matrix: one row per query, one column per document
query_matrix = np.array(query_embeddings)
doc_matrix = np.array(doc_embeddings)
scores = np.dot(query_matrix, doc_matrix.T)

print("\n--- Results (Default Dimension) ---")
print("Query embeddings shape:", query_matrix.shape)
print("Document embeddings shape:", doc_matrix.shape)
print("Similarity scores (dot product):")
print(scores.tolist())


In [None]:
# --- Matryoshka Embedding Example (dim=1024) ---
# Same queries and documents as above, this time requesting an explicit dimension.
dim = 1024
print("\n--- Matryoshka Embedding Example (dim=1024) ---")

print(f"1. Embedding queries with dimension {dim}...")
query_response_matryoshka = get_embeddings(texts=queries, is_query=True, dim=dim)
query_embeddings_matryoshka = query_response_matryoshka["dense_embeddings"]

print(f"2. Embedding documents with dimension {dim}...")
doc_response_matryoshka = get_embeddings(texts=documents, dim=dim)
doc_embeddings_matryoshka = doc_response_matryoshka["dense_embeddings"]


In [None]:
# Similarity matrix for the explicit-dimension embeddings (rows: queries, columns: documents)
query_matrix_matryoshka = np.array(query_embeddings_matryoshka)
doc_matrix_matryoshka = np.array(doc_embeddings_matryoshka)
scores_matryoshka = np.dot(query_matrix_matryoshka, doc_matrix_matryoshka.T)

print(f"\n--- Results (dim={dim}) ---")
print("Query embeddings shape:", query_matrix_matryoshka.shape)
print("Document embeddings shape:", doc_matrix_matryoshka.shape)
print("Similarity scores (dot product):")
print(scores_matryoshka.tolist())

# The ranking below uses `scores` from the default-dimension example
print("\nMost similar document for each query (using default dimension embeddings):")
for i, query in enumerate(queries):
    query_scores = scores[i]
    most_similar_doc_index = np.argmax(query_scores)
    print(f"Query: '{query}'")
    print(f"Most similar document: '{documents[most_similar_doc_index]}'")
    print(f"Score: {query_scores[most_similar_doc_index]:.4f}\n")