In [None]:
# Install or upgrade the packages used by this notebook (-q: quiet output, -U: upgrade if already installed).
%pip install sagemaker boto3 litellm aiohttp -qU

<div class="alert alert-block alert-info">
<center>⚠️ <b>Important:</b> Please restart the kernel after installing the dependencies. ⚠️</center>
</div>

## Deploy the model from SageMaker JumpStart on a SageMaker Inference endpoint

> Note: skip the cell below if you have already deployed your model.

In [None]:
from sagemaker.jumpstart.model import JumpStartModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.enums import EndpointType
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements


# Resources requested for the inference component hosting the model.
resources = ResourceRequirements(
    requests=dict(
        num_accelerators=4,  # accelerators required
        memory=96 * 1024,    # minimum memory in MB (required field)
        copies=1,
    )
)

# JumpStart model handle, pinned to an explicit model version.
model = JumpStartModel(
    model_id="huggingface-llm-mistral-small-24B-Instruct-2501",
    model_version="2.0.1",
    instance_type="ml.g5.12xlarge",
)

# Deploy on an inference-component-based endpoint whose managed scaling
# range is 0 to 1 instances. The predictor exchanges JSON in both directions.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED,
    resources=resources,
    managed_instance_scaling=dict(MinInstanceCount=0, MaxInstanceCount=1),
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    accept_eula=True,
)

In [None]:
# Keep the endpoint and inference component identifiers; the invocation
# cells further down read both names.
endpoint_name, component_name = predictor.endpoint_name, predictor.component_name
print(
    f"Endpoint name: {endpoint_name}",
    f"Inference component name: {component_name}",
    sep="\n",
)

**NOTE:** deployment will take 5~7 minutes.

## Test it

### Using the Predictor object from the SageMaker Python SDK

In [None]:
# Reuse the predictor created by the deployment cell when it exists;
# otherwise attach to an endpoint that was deployed earlier.
try:
    predictor
except NameError:
    # Only "predictor is not defined" should trigger the fallback. A bare
    # `except:` would also swallow KeyboardInterrupt and unrelated errors.
    import boto3
    from sagemaker.session import Session
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import JSONSerializer
    from sagemaker.deserializers import JSONDeserializer

    # Replace the placeholders with the names of your existing resources.
    endpoint_name = "YOUR-ENDPOINT-NAME-HERE"
    component_name = "YOUR-INFERENCE-COMPONENT-NAME-HERE"

    # SageMaker session bound to the region of the default boto3 session.
    boto_session = boto3.session.Session(region_name=boto3.Session().region_name)
    session = Session(boto_session=boto_session)

    # JSON in, JSON out — same (de)serializers as the deployment cell.
    predictor = Predictor(
        sagemaker_session=session,
        endpoint_name=endpoint_name, component_name=component_name,
        serializer=JSONSerializer(), deserializer=JSONDeserializer()
    )

In [None]:
%%time
# Single-turn chat request sent through the SageMaker SDK predictor.
prompt = "What is the town of Bari, Italy, known for?"
payload = dict(
    messages=[dict(role="user", content=prompt)],
    max_tokens=4 * 1024,
    temperature=0.1,
    top_p=0.9,
)

response = predictor.predict(payload)
# Show only the text of the first returned choice.
print(response["choices"][0]["message"]["content"])

### Using Boto3

In [None]:
%%time
import boto3
import json

# Text-generation request using the "inputs" / "parameters" payload layout.
payload = {
    "inputs": "What is the town of Bari, Italy, known for? Provide a short answer.",
    "parameters": {
        "max_new_tokens": 4*1024,
        "top_p": 0.9,
        "temperature": 0.2,
    }
}

runtime = boto3.client('sagemaker-runtime', region_name=boto3.Session().region_name)

# botocore validates parameter types and rejects None for a string
# parameter, so InferenceComponentName is only sent when a name is set.
invoke_kwargs = {
    "EndpointName": endpoint_name,
    "ContentType": 'application/json',
    "Body": json.dumps(payload),
}
if component_name:
    invoke_kwargs["InferenceComponentName"] = component_name

response = runtime.invoke_endpoint(**invoke_kwargs)

result = json.loads(response['Body'].read().decode())
# NOTE(review): depending on the serving container, the body is either a
# dict or a one-element list of dicts — accept both shapes.
generation = result[0] if isinstance(result, list) else result
print(generation['generated_text'])

### Using Boto3 and the Messages API (for compatible models only)

In [None]:
%%time
# Chat request in the OpenAI-style Messages format. Sampling options are
# top-level fields in this schema (as in the Predictor example earlier in
# this notebook); nesting them under "parameters" would leave them unused.
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
    ],
    "max_tokens": 4*1024,
    "top_p": 0.9,
    "temperature": 0.6,
}

# Relies on the `runtime` client and `json` import from the previous Boto3 cell.
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
# Print the assistant message object of the first choice.
print(result['choices'][0]['message'])

### Using LiteLLM

In [None]:
from litellm import completion


# Conversation sent to the model: a system instruction plus the user question.
chat_messages = [
    {"role": "system", "content": "You are a helpful and honest assistant."},
    {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."},
]

# The `sagemaker/` prefix selects LiteLLM's SageMaker provider for the
# endpoint; the inference component name is passed as `model_id`.
response = completion(
    model=f"sagemaker/{endpoint_name}",
    model_id=component_name,
    messages=chat_messages,
    temperature=0.2,
    max_tokens=1024,
)

# Bare last expression so the notebook displays the reply text.
response.choices[0].message.content

<div class="alert alert-block alert-info">
⚠️ <b>Important:</b> as of LiteLLM v1.67.2, the <code>sagemaker_chat</code> provider does not correctly pass the inference component name, causing <code>HTTPStatusError: Client error '400 Bad Request'</code>. Please use the <code>sagemaker</code> provider instead.
</div>