In [None]:
# Install (or upgrade, -U) the SageMaker SDK, Boto3 and LiteLLM into the kernel's environment; -q keeps pip quiet.
%pip install sagemaker boto3 litellm -qU

<div class="alert alert-block alert-info">
<center>⚠️ <b>Important:</b> Please restart the kernel after installing the dependencies. ⚠️</center>
</div>

## Deploy the model from SageMaker JumpStart on a SageMaker Inference endpoint

In [None]:
from sagemaker.jumpstart.model import JumpStartModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.enums import EndpointType
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements


# Resource requests attached to the inference component at deploy time.
resources = ResourceRequirements(
    requests=dict(
        num_accelerators=4,  # number of accelerators required
        memory=96 * 1024,    # minimum memory required, in MB (mandatory field)
        copies=1,
    )
)

# JumpStart model handle for Mistral Small 24B Instruct.
model = JumpStartModel(
    model_id="huggingface-llm-mistral-small-24B-Instruct-2501",
    model_version="*",
    instance_type="ml.g5.12xlarge",
)

# Deploy as an inference-component-based endpoint that exchanges JSON payloads.
predictor = model.deploy(
    accept_eula=True,
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED,
    resources=resources,
    managed_instance_scaling=dict(MinInstanceCount=0, MaxInstanceCount=1),
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

## Test it

### Using the Predictor object from the SageMaker Python SDK

In [None]:
# Make sure `predictor`, `endpoint_name` and `component_name` all exist,
# whether the endpoint was deployed in this kernel or already existed.
try:
    predictor
except NameError:
    # No predictor in this kernel: attach to an already-running endpoint.
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import JSONSerializer
    from sagemaker.deserializers import JSONDeserializer

    endpoint_name = "REPLACE-WITH-ENDPOINT-NAME"
    component_name = "REPLACE-WITH-INFERENCE-COMPONENT-NAME"

    predictor = Predictor(
        endpoint_name=endpoint_name, component_name=component_name,
        serializer=JSONSerializer(), deserializer=JSONDeserializer()
    )
else:
    # A predictor already exists (e.g. from the deploy cell): derive the names
    # that the Boto3 and LiteLLM cells rely on, so they do not hit a NameError.
    endpoint_name = predictor.endpoint_name
    component_name = getattr(predictor, "component_name", None)

In [None]:
%%time
prompt = "What is the town of Bari, Italy, known for?"
payload = {
    "messages": [
        {
            "role": "user",
            "content": prompt
        }
    ],
    "max_tokens": 4000,
    "temperature": 0.1,
    "top_p": 0.9,
}

response = predictor.predict(payload)
print(response['choices'][0]['message']['content'])

### Using Boto3

In [None]:
%%time
import boto3
import json

payload = {
    "inputs": "What is the town of Bari, Italy, known for? Provide a short answer.",
    "parameters": {
        "max_new_tokens": 4*1024,
        "top_p": 0.9,
        "temperature": 0.2,
    }
}

runtime = boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name or None,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
print(result['generated_text'])

### Using Boto3 and the Messages API (for compatible models only)

In [None]:
%%time
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
    ],
    "max_tokens": 4*1024,
    "parameters": {
        "top_p": 0.9,
        "temperature": 0.6,
    }
}

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
print(result['choices'][0]['message'])

## Using LiteLLM

In [None]:
from litellm import completion

# Route the chat request through LiteLLM's `sagemaker` provider; `model_id`
# carries the inference component name.
response = completion(
    model="sagemaker/{}".format(endpoint_name),
    model_id=component_name,
    max_tokens=1024,
    temperature=0.2,
    messages=[
        dict(role="system", content="You are a helpful and honest assistant."),
        dict(role="user", content="What is the town of Bari, Italy, known for? Provide a short answer."),
    ],
)
response.choices[0].message.content

<div class="alert alert-block alert-info">
⚠️ <b>Important:</b> as of LiteLLM v1.67.2, the `sagemaker_chat` provider does not correctly pass the inference component name, causing `HTTPStatusError: Client error '400 Bad Request'`. Please use the `sagemaker` provider instead.
</div>

In [None]:
from litellm import completion

# The `sagemaker_chat` provider is known to fail with inference components on
# some LiteLLM versions. The failure is caught and displayed so that
# "Restart & Run All" can continue past this demonstration cell.
try:
    response = completion(
        model=f"sagemaker_chat/{endpoint_name}",
        model_id=component_name,
        messages=[
            {"role": "system", "content": "You are a helpful and honest assistant."},
            {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
        ],
        temperature=0.2,
        max_tokens=1024
    )
except Exception as err:
    print(f"sagemaker_chat request failed: {type(err).__name__}: {err}")
else:
    print(response.choices[0].message.content)

## Function calling with Boto3

In [None]:
def get_top_song(sign: str) -> dict:
    """Return the most popular song played on a radio station.

    Args:
        sign: Call sign of the radio station; it is upper-cased in the result.

    Returns:
        A dict with the keys ``sign``, ``song`` and ``artist``. The song and
        artist are the same fixed values for every station.
    """
    return dict(sign=sign.upper(), song="In the End", artist="Linkin Park")


In [None]:
# Schema-style description of the `top_song` tool, later embedded in the system prompt.
# Key order is kept stable because the dict is rendered as text inside that prompt.
tools_description = dict(
    name="top_song",
    description="Get the most popular song played on a radio station.",
    parameters=dict(
        type="object",
        properties=dict(
            sign=dict(
                type="string",
                description="The call sign for the radio station for which you want the most popular song. Example calls signs are WZPZ and WKRP.",
            ),
        ),
        required=["sign"],
    ),
)

In [None]:
# System prompt: lists the available tool, fixes the JSON shape of a tool call,
# and tells the model how to phrase its answer once a tool result arrives.
system_message = "".join([
    # Tool catalogue (the dict is rendered with its default string form).
    "You are a helpful assistant with access to these tools:\n\n",
    str(tools_description),
    "\n",
    "Choose the appropriate tool based on the user's question. "
    "If no tool is needed, reply directly.\n\n",
    # Required reply format when a tool is needed.
    "IMPORTANT: When you need to use a tool, you must ONLY respond with "
    "the exact JSON object format below, nothing else:\n",
    "{\n"
    '    "tool": "tool-name",\n'
    '    "arguments": {\n'
    '        "argument-name": "value"\n'
    "    }\n"
    "}\n\n",
    # Guidance for the follow-up turn.
    "After receiving a tool's response:\n"
    "1. Transform the raw data into a natural, conversational response\n"
    "2. Keep responses concise but informative\n"
    "3. Focus on the most relevant information\n"
    "4. Use appropriate context from the user's question\n"
    "5. Avoid simply repeating the raw data\n\n",
    "Please use only the tools that are explicitly defined above.\n\n",
])

In [None]:
import json
import re

import boto3

# First turn of the tool-calling conversation. Sampling options are top-level
# fields of the chat request body, next to "max_tokens".
payload = {
    "messages": [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "What is the most popular song on WZPZ?"}
    ],
    "max_tokens": 4*1024,
    "top_p": 0.9,
    "temperature": 0.6,
}

runtime = boto3.client('sagemaker-runtime')
# Target the endpoint configured earlier in the notebook instead of names that
# only exist in one specific deployment.
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=component_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
reply_text = result['choices'][0]['message']['content']

# Extract the JSON object from the reply. If the model answered directly (no
# JSON object, or text that is not valid JSON), keep the plain text instead of
# failing on `match.group(0)`.
match = re.search(r"\{.*\}", reply_text, re.DOTALL)
try:
    content = json.loads(match.group(0)) if match else reply_text
except json.JSONDecodeError:
    content = reply_text
content

In [None]:
# Display the messages currently stored in the request payload.
payload["messages"]

In [None]:
# Resolve tool calls until the model answers in plain text.
# `content` holds the model's latest reply: a dict with a "tool" key when it
# requests a tool, anything else when it is the final answer.
MAX_TOOL_ROUNDS = 5  # safety bound so a model that keeps requesting tools cannot loop forever

final_response = None
tool_rounds = 0
while final_response is None:
    if not (isinstance(content, dict) and "tool" in content):
        # Not a tool request: treat the reply as the final answer.
        final_response = content
        break
    if tool_rounds == MAX_TOOL_ROUNDS:
        raise RuntimeError(f"No final answer after {MAX_TOOL_ROUNDS} tool calls")
    tool_rounds += 1

    tool_name = content["tool"]
    tool_arguments = content["arguments"]
    if tool_name == "top_song":
        tool_response = get_top_song(tool_arguments["sign"])
    else:
        raise ValueError(f"Unknown tool: {tool_name}")

    # Record the model's tool request before the tool result, so the history
    # keeps alternating user/assistant turns and the model sees its own call.
    payload["messages"].append({
        "role": "assistant",
        "content": json.dumps(content)
    })
    payload["messages"].append({
        "role": "user",
        "content": json.dumps(tool_response)
    })

    # Use the endpoint configured earlier in the notebook rather than names
    # tied to one specific deployment.
    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        InferenceComponentName=component_name,
        ContentType='application/json',
        Body=json.dumps(payload)
    )
    result = json.loads(response['Body'].read().decode())
    reply_text = result['choices'][0]['message']['content']

    # Only a JSON object carrying a "tool" key counts as another tool request;
    # a reply without JSON, or with invalid JSON, is the final answer.
    match = re.search(r"\{.*\}", reply_text, re.DOTALL)
    parsed = None
    if match:
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError:
            parsed = None
    content = parsed if isinstance(parsed, dict) and "tool" in parsed else reply_text
final_response

## Function Calling with LiteLLM

> **Note**: as of v1.67.2, LiteLLM `sagemaker` and `sagemaker_chat` providers do not support tool calling. The cells below **will not** work.