In [None]:
# Install (or upgrade, -U) the packages listed below into the kernel's environment; -q keeps pip's output quiet.
%pip install sagemaker boto3 litellm aiohttp -qU

In [None]:
from IPython import get_ipython

# Shut the kernel down with restart=True so it comes back with a clean state
# (presumably so the packages installed in the previous cell are picked up).
# Run the remaining cells only after the kernel has restarted.
ipython_shell = get_ipython()
ipython_shell.kernel.do_shutdown(True)

## Deploy the model from SageMaker JumpStart on a SageMaker Inference endpoint

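> Note: if you have already deployed your model, skip the deployment cell below, as well as the cell after it that reads the endpoint and inference component names from `predictor`. Keep the names of your existing endpoint and inference component at hand instead.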

In [None]:
from sagemaker.jumpstart.model import JumpStartModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.enums import EndpointType
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements


# --- Deployment settings -----------------------------------------------------
# Each value is defined once so the model definition and the endpoint
# deployment cannot drift apart (the instance type is needed by both).
MODEL_ID = "huggingface-llm-mistral-small-24B-Instruct-2501"
MODEL_VERSION = "2.0.2"
INSTANCE_TYPE = "ml.g5.12xlarge"

# NOTE(review): this container image is pinned to the us-east-1 registry.
# Confirm it matches the region you are deploying to, and adjust if needed.
IMAGE_URI = (
    "763104351884.dkr.ecr.us-east-1.amazonaws.com/"
    "djl-inference:0.33.0-lmi15.0.0-cu128-v1.1"
)

NUM_ACCELERATORS = 4        # Number of accelerators requested for the inference component
MIN_MEMORY_MB = 96 * 1024   # Minimum memory requested, in MB (required)
NUM_COPIES = 1              # Number of copies requested

# Resources requested for the inference component that will host the model.
resources = ResourceRequirements(
    requests={
        "num_accelerators": NUM_ACCELERATORS,
        "memory": MIN_MEMORY_MB,
        "copies": NUM_COPIES,
    }
)

# JumpStart model definition, served with the container image pinned above.
model = JumpStartModel(
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    instance_type=INSTANCE_TYPE,
    image_uri=IMAGE_URI,
)

# Deploy to an inference-component-based endpoint. Requests and responses are
# (de)serialized as JSON. `accept_eula=True` is passed explicitly; make sure
# you have reviewed the model's license terms before running this cell.
predictor = model.deploy(
    accept_eula=True,
    initial_instance_count=1,
    instance_type=INSTANCE_TYPE,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED,
    resources=resources,
)

In [None]:
# Keep the identifiers of the deployed resources in plain variables and echo
# them, so they can be noted down for later use.
endpoint_name, component_name = predictor.endpoint_name, predictor.component_name

print(f"Endpoint name: {endpoint_name}")
print(f"Inference component name: {component_name}")

<div class="alert alert-block alert-info">
⚠️ <b>Note:</b> Deployment takes about 5–7 minutes. Take note of the endpoint name and the inference component name printed above, as they will be needed later.
</div>