In [2]:
# SageMaker JumpStart provides APIs, as part of the SageMaker Python SDK, that let you deploy and fine-tune models in network isolation using scripts that SageMaker maintains.

from sagemaker.jumpstart.model import JumpStartModel


# JumpStart model identifier and the name given to the endpoint created for it.
model_id = "huggingface-llm-falcon-40b-bf16"
endpoint_name = "sm-js-llm-async-falcon-40b-bf16"

# Sample request payload: a prompt plus generation parameters.
# The prompt is split over adjacent string literals purely for readability;
# Python concatenates them into one string.
endpoint_input = {
    "inputs": (
        "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. "
        "Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\n"
        "Daniel: Hello, Girafatron!\n"
        "Girafatron:"
    ),
    "parameters": {
        "max_new_tokens": 50,
        "top_k": 10,
        "return_full_text": False,
        "do_sample": True,
    },
}



In [3]:
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
import time

# SageMaker session object; later cells use it for the default bucket and
# for the low-level SageMaker client.
sagemaker_session = sagemaker.Session()

# Execution role of the current environment and the region the session is
# bound to. Neither is referenced by the other cells visible in this notebook.
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name

In [4]:
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig

# Results are written under s3://<default bucket>/<prefix>/output.
s3_bucket = sagemaker_session.default_bucket()
bucket_prefix = "falcon-async-inference"

# Asynchronous-inference settings handed to `deploy` in the next code cell.
async_config = AsyncInferenceConfig(
    output_path=f"s3://{s3_bucket}/{bucket_prefix}/output",
    max_concurrent_invocations_per_instance=4,
    # Optional: Amazon SNS topics for success/error notifications.
    # notification_config={
    #     "SuccessTopic": "arn:aws:sns:<aws-region>:<account-id>:<topic-name>",
    #     "ErrorTopic": "arn:aws:sns:<aws-region>:<account-id>:<topic-name>",
    # },
)

In [6]:
# Build the JumpStart model object for the configured model id.
model = JumpStartModel(model_id=model_id)

# Deploy it on a single ml.g5.24xlarge instance under the chosen endpoint name,
# passing the async inference configuration prepared earlier.
# NOTE(review): the progress line printed below this cell suggests the call
# waits for the endpoint to come up -- confirm against the SDK defaults.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.24xlarge",
    endpoint_name=endpoint_name,
    async_inference_config=async_config,
)


--------------------------!

In [None]:
# Check that the endpoint exists and report its current status.
# The lookup itself fails (boto3 raises a client error) when no endpoint
# with this name exists.
endpoint_description = sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

# Surface the status explicitly: fetching the description without looking at
# it does not tell the reader whether the endpoint is actually usable.
print(f"Endpoint {endpoint_name!r} status: {endpoint_description['EndpointStatus']}")

# The next cell invokes the async endpoint for inference.



In [None]:
# Send the sample payload to the deployed endpoint and keep the reply.
response = predictor.predict(endpoint_input)

# Show the request next to the reply so the output is self-explanatory.
report = f"Inference:\nInput: {endpoint_input}\nResponse: {response}\n"
print(report)