# Inference with Amazon SageMaker AI

<div class="alert alert-block alert-info">
	⚠️ <b>Important:</b> ⚠️<br/>
	Make sure you've run the <code>0-setup/1-required-dependencies.ipynb</code> notebook in this repository before proceeding.<br/>
	Make sure you've deployed the model according to <code>0-setup/2-setup-sagemaker-endpoint.ipynb</code> before proceeding.
</div>

Fetch the `SAGEMAKER_ENDPOINT_NAME` that was deployed during prerequisites.

In [None]:
# Restore the endpoint name that an earlier notebook persisted with %store,
# then echo it so the reader can confirm which endpoint will be invoked.
%store -r SAGEMAKER_ENDPOINT_NAME
print(f"Endpoint name: {SAGEMAKER_ENDPOINT_NAME}")

In [None]:
import boto3
import json

# Look up the region configured on the default boto3 session and create the
# SageMaker runtime client that the inference calls go through.
boto_session = boto3.Session()
region = boto_session.region_name
sagemaker_runtime = boto3.client(
    service_name="sagemaker-runtime",
    region_name=region,
)

print(
    f"Region: {region}",
    "Using boto3 sagemaker-runtime client for inference",
    sep="\n",
)

Synchronous response:

In [None]:
def predict(endpoint_name, payload):
    """Invoke a SageMaker endpoint synchronously and return the parsed JSON body.

    Args:
        endpoint_name: Name of the deployed SageMaker endpoint.
        payload: JSON-serializable request body.

    Returns:
        The endpoint's response body, decoded from JSON.
    """
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    return json.loads(response["Body"].read().decode("utf-8"))


# The request is defined in this cell so it runs on a fresh kernel and never
# picks up a streaming payload left behind by a cell further down.
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."},
    ],
    "max_tokens": 4 * 1024,
    "temperature": 0.1,
    "top_p": 0.9,
}

response = predict(SAGEMAKER_ENDPOINT_NAME, payload)
print(response["choices"][0]["message"]["content"])
print(response["usage"])

Streaming response:

In [None]:
import codecs


def predict_stream(endpoint_name, payload):
    """Invoke a SageMaker endpoint with response streaming.

    Args:
        endpoint_name: Name of the deployed SageMaker endpoint.
        payload: JSON-serializable request body; it should enable streaming.

    Returns:
        The event stream from the response ``Body``; the loop below reads
        ``event["PayloadPart"]["Bytes"]`` from each event.
    """
    streamed = sagemaker_runtime.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    return streamed["Body"]


# Streaming has to be requested explicitly, so this cell builds its own payload
# instead of relying on one defined elsewhere in the notebook.
stream_payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."},
    ],
    "max_tokens": 4 * 1024,
    "temperature": 0.1,
    "top_p": 0.9,
    "stream": True,
}

response = predict_stream(SAGEMAKER_ENDPOINT_NAME, stream_payload)

json_decoder = json.JSONDecoder()
# Incremental decoding keeps a multi-byte character that is split across two
# payload parts from raising UnicodeDecodeError.
utf8_decoder = codecs.getincrementaldecoder("utf-8")()
buffer = ""
for event in response:
    buffer += utf8_decoder.decode(event["PayloadPart"]["Bytes"])
    # A payload part may hold a fragment of a JSON object, exactly one, or
    # several; drain every complete object before reading the next part.
    while True:
        buffer = buffer.lstrip()
        # Tolerate server-sent-event framing ("data: {...}" / "data: [DONE]").
        for marker in ("data:", "[DONE]"):
            if buffer.startswith(marker):
                buffer = buffer[len(marker):].lstrip()
        try:
            message, end = json_decoder.raw_decode(buffer)
        except json.JSONDecodeError:
            break  # incomplete object: wait for the next payload part
        buffer = buffer[end:]
        choices = message["choices"]
        if not choices:
            continue  # e.g. a chunk that carries no choices
        content = choices[0].get("delta", {}).get("content")
        if content:
            print(content, end="", flush=True)
print()

## Using Boto3 and the Messages API (for compatible models only)

In [None]:
import boto3

# SageMaker runtime client built from boto3's default session configuration.
runtime = boto3.client(service_name="sagemaker-runtime")

In [None]:
%%time
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
    ],
    "max_tokens": 4*1024,
    "temperature": 0.1,
    "top_p": 0.9
}

response = runtime.invoke_endpoint(
    EndpointName=SAGEMAKER_ENDPOINT_NAME,
    ContentType='application/json',
    Body=json.dumps(payload)
)

result = json.loads(response['Body'].read().decode())
print(result['choices'][0]['message']["content"])

In [None]:
%%time
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful and honest assistant."},
        {"role": "user", "content": "What is the town of Bari, Italy, known for? Provide a short answer."}
    ],
    "max_tokens": 4*1024,
    "temperature": 0.1,
    "top_p": 0.9,
	"stream": True,
	"stream_options":{'include_usage': True}
}


response = runtime.invoke_endpoint_with_response_stream(
	EndpointName=SAGEMAKER_ENDPOINT_NAME,
	ContentType='application/json',
	Body=json.dumps(payload)
)

import codecs

json_decoder = json.JSONDecoder()
# Incremental decoding keeps a multi-byte character that is split across two
# payload parts from raising UnicodeDecodeError.
utf8_decoder = codecs.getincrementaldecoder("utf-8")()
partial_chunk = ""
for event in response["Body"]:
    partial_chunk += utf8_decoder.decode(event["PayloadPart"]["Bytes"])
    # A payload part may hold a fragment of a JSON object, exactly one, or
    # several; drain every complete object before reading the next part.
    while True:
        partial_chunk = partial_chunk.lstrip()
        # Tolerate server-sent-event framing ("data: {...}" / "data: [DONE]").
        for marker in ("data:", "[DONE]"):
            if partial_chunk.startswith(marker):
                partial_chunk = partial_chunk[len(marker):].lstrip()
        try:
            message, end = json_decoder.raw_decode(partial_chunk)
        except json.JSONDecodeError:
            break  # incomplete object: wait for the next payload part
        partial_chunk = partial_chunk[end:]
        choices = message["choices"]
        if not choices:
            # NOTE(review): with `include_usage` requested, OpenAI-compatible
            # servers are documented to send a final chunk whose `choices` list
            # is empty; confirm against the serving container in use.
            continue
        content = choices[0].get("delta", {}).get("content")
        if content:
            # flush=True so each fragment is displayed as soon as it arrives.
            print(content, end="", flush=True)
print()

## Using LiteLLM

In [None]:
import os

from litellm import completion

# `os` was used here without being imported anywhere in the notebook.
# Export the default boto3 session's region under AWS_REGION_NAME before
# LiteLLM is called.
os.environ["AWS_REGION_NAME"] = boto3.Session().region_name

In [None]:

# Blocking chat completion through LiteLLM; the model string is the
# "sagemaker_chat/" prefix followed by the endpoint name.
response = completion(
    model=f"sagemaker_chat/{SAGEMAKER_ENDPOINT_NAME}",
    messages=[
        dict(role="system", content="You are a helpful and honest assistant."),
        dict(role="user", content="What is the town of Bari, Italy, known for? Provide a short answer."),
    ],
    max_tokens=4 * 1024,
    temperature=0.1,
    top_p=0.9,
)

# Show the generated text first, then the usage attached to the response.
print(response.choices[0].message.content)
print(response.usage)

In [None]:
# Same request as a stream: with stream=True the result is iterated chunk by
# chunk instead of being read as a single response object.
response = completion(
    model=f"sagemaker_chat/{SAGEMAKER_ENDPOINT_NAME}",
    messages=[
        dict(role="system", content="You are a helpful and honest assistant."),
        dict(role="user", content="What is the town of Bari, Italy, known for? Provide a short answer."),
    ],
    max_tokens=4 * 1024,
    temperature=0.1,
    top_p=0.9,
    stream=True,
)

for event in response:
    content = event.choices[0].delta.content
    if not content:
        # Skip chunks whose delta has no text.
        continue
    print(content, end="", flush=False)