In [None]:
%pip install -Uq sagemaker==2.239.0
%pip install -Uq boto3==1.38.33
%pip install -Uq litellm==1.72.2
%pip install -Uq aiohttp==3.12.11

In [None]:
# Restart the kernel (presumably so the packages installed above are the
# versions that get imported in the following cells).
from IPython import get_ipython

ipython_shell = get_ipython()
ipython_shell.kernel.do_shutdown(True)

## Inference with Amazon SageMaker AI

In [None]:
%store -r SAGEMAKER_ENDPOINT_NAME
print(f"Endpoint name: {SAGEMAKER_ENDPOINT_NAME}")

In [None]:
import boto3
from sagemaker.session import Session
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Resolve the region from the default boto3 configuration and pin it
# explicitly on the session handed to the SageMaker SDK.
aws_region = boto3.Session().region_name
boto_session = boto3.session.Session(region_name=aws_region)
session = Session(boto_session=boto_session)

# Predictor bound to the existing endpoint; requests are serialized to JSON
# and responses are parsed from JSON.
predictor = Predictor(
    endpoint_name=SAGEMAKER_ENDPOINT_NAME,
    sagemaker_session=session,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
%%time
prompt = "What is the town of Bari, Italy, known for?"
payload = {
    "messages": [
        {
            "role": "user",
            "content": prompt
        }
    ],
    "max_tokens": 4*1024,
    "temperature": 0.1,
    "top_p": 0.9,
}

response = predictor.predict(payload)
print(response['choices'][0]['message']['content'])

### Using Boto3

In [None]:
%%time
import boto3
import json

payload = {
    "inputs": "What is the town of Bari, Italy, known for? Provide a short answer.",
    "parameters": {
        "max_tokens": 4*1024,
        "temperature": 0.1,
        "top_p": 0.9
    }
}

runtime = boto3.client('sagemaker-runtime', region_name=boto3.Session().region_name)
response = runtime.invoke_endpoint(
    EndpointName=SAGEMAKER_ENDPOINT_NAME,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
print(result['generated_text'])

### Using Boto3 and the Messages API (for compatible models only)

In [None]:
%%time
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
    ],
    "max_tokens": 4*1024,
    "temperature": 0.1,
    "top_p": 0.9
}

response = runtime.invoke_endpoint(
    EndpointName=SAGEMAKER_ENDPOINT_NAME,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
print(result['choices'][0]['message'])

## Using LiteLLM

In [None]:
from litellm import completion

# Same system + user conversation as in the Boto3 example, routed through
# LiteLLM's `sagemaker` provider.
chat_messages = [
    {"role": "system", "content": "You are a helpful and honest assistant."},
    {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."},
]

# NOTE(review): for endpoints that host inference components, LiteLLM
# presumably needs `model_id=<component name>` as an extra argument - confirm.
response = completion(
    model=f"sagemaker/{SAGEMAKER_ENDPOINT_NAME}",
    messages=chat_messages,
    temperature=0.1,
    max_tokens=4 * 1024,
    top_p=0.9,
)
# Bare last expression: the notebook displays the generated text.
response.choices[0].message.content

<div class="alert alert-block alert-info">
⚠️ <b>Important:</b> as of LiteLLM v1.67.2, the <code>sagemaker_chat</code> provider does not correctly pass the inference component name, causing <code>HTTPStatusError: Client error '400 Bad Request'</code>. Please use the <code>sagemaker</code> provider instead.
</div>