In [None]:
import ipywidgets as widgets
from IPython.display import display

# SageMaker instance types the user can pick from for the endpoint.
sagemaker_instances = [
    "ml.m5.large",
    "ml.m5.xlarge",
    "ml.m5.2xlarge",
    "ml.m5.4xlarge",
]

# Selection widget; its `.value` is read later when the endpoint is configured.
dropdown = widgets.Dropdown(description="Instances:", options=sagemaker_instances)

# Render the widget in this cell's output.
display(dropdown)

In [None]:
"""
This creates the endpoint with the appropriate permissions. Make sure you have the bucket created first.
"""

import boto3

import boto3
import json


# Instance type currently selected in the dropdown widget.
instance_type = dropdown.value

# AWS clients: IAM for the execution role, SageMaker for model/endpoint resources.
iam = boto3.client("iam")
client = boto3.client("sagemaker")

# --- Settings to fill in before running this cell ---
bucket_name = "<your-bucket-name>"  # S3 bucket the execution role is granted access to
role_name = "SageMakerExecutionRole"  # name of the IAM role to create

# Names of the SageMaker resources created further down in this cell.
model_name = "<model-name>"
endpoint_config_name = "<endpoint-config-name>"
endpoint_name = "<endpoint-name>"

# Container image URI (ECR) used as the model's primary container.
image = "<account-id>.dkr.ecr.<region>.amazonaws.com/<image-name>:latest"

# Trust policy: allows the SageMaker service principal to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Create the execution role. If it already exists (e.g. this cell is re-run
# after a later step failed), reuse it instead of aborting the whole cell.
try:
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # get_role returns the same {'Role': {..., 'Arn': ...}} shape as
    # create_role, so code reading create_role_response['Role']['Arn'] still works.
    # NOTE(review): the existing role's trust policy is not modified here;
    # confirm it allows sagemaker.amazonaws.com if the role was created elsewhere.
    create_role_response = iam.get_role(RoleName=role_name)
    print(f"IAM role '{role_name}' already exists; reusing it.")

# Permissions policy for the execution role: every S3 action on the objects
# inside the configured bucket.
# NOTE(review): "s3:*" is broader than a read-only model download needs, and the
# object-level Resource does not cover bucket-level actions such as
# s3:ListBucket -- consider narrowing/adjusting once the container's needs are known.
# NOTE(review): this policy only covers S3. If the image lives in a private ECR
# repository, the role presumably also needs ECR pull and CloudWatch Logs
# permissions -- confirm.
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": f"arn:aws:s3:::{bucket_name}/*"
        }
    ]
}

policy_name = f"{role_name}Policy"

# Create the managed policy. If a policy with this name already exists (e.g.
# the cell is being re-run), look it up and reuse it instead of failing.
try:
    create_policy_response = iam.create_policy(
        PolicyName=policy_name,
        PolicyDocument=json.dumps(policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Search the account's customer-managed policies for the matching name.
    existing_policy = next(
        (
            policy
            for page in iam.get_paginator('list_policies').paginate(Scope='Local')
            for policy in page['Policies']
            if policy['PolicyName'] == policy_name
        ),
        None,
    )
    if existing_policy is None:
        raise
    # Keep the {'Policy': {..., 'Arn': ...}} shape that create_policy returns.
    create_policy_response = {'Policy': existing_policy}
    # The existing policy document is left untouched; it may reference a
    # different bucket if bucket_name changed since it was created.
    print(f"IAM policy '{policy_name}' already exists; reusing it unchanged.")

# Attach the policy to the execution role.
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=create_policy_response['Policy']['Arn']
)




# ARN of the execution role that SageMaker assumes to run the container.
exec_role_arn = create_role_response['Role']['Arn']

# IAM changes are eventually consistent: a role created moments ago may not yet
# be assumable by SageMaker, in which case CreateModel is rejected with a
# ValidationException mentioning "Could not assume role". Retry for a short,
# bounded period instead of failing the whole cell on a fresh run.
import time

_MAX_CREATE_MODEL_ATTEMPTS = 12
_RETRY_DELAY_SECONDS = 5

for _attempt in range(1, _MAX_CREATE_MODEL_ATTEMPTS + 1):
    try:
        # Register the model: a single container image run under the execution role.
        response = client.create_model(
            ModelName=model_name,
            PrimaryContainer={
                'Image': image,
                'Mode': 'SingleModel',
            },
            ExecutionRoleArn=exec_role_arn
        )
        break
    except client.exceptions.ClientError as err:
        _error = err.response.get('Error', {})
        _role_not_ready = (
            _error.get('Code') == 'ValidationException'
            and 'Could not assume role' in _error.get('Message', '')
        )
        # Any other error, or running out of attempts, is surfaced unchanged.
        if not _role_not_ready or _attempt == _MAX_CREATE_MODEL_ATTEMPTS:
            raise
        time.sleep(_RETRY_DELAY_SECONDS)



# Endpoint configuration: one production variant named "default" that serves
# the model on a single instance of the selected type and takes all traffic weight.
response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName="default",
            ModelName=model_name,
            InitialInstanceCount=1,
            InstanceType=instance_type,
            InitialVariantWeight=1.0,
        )
    ],
)


# Start provisioning the endpoint from the configuration created above.
response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

# create_endpoint returns as soon as provisioning has *started*. Block until the
# endpoint reports InService so that later cells (which invoke the endpoint) do
# not fail when the notebook is run top to bottom. This can take several
# minutes; the waiter raises if the endpoint ends up in a failed state.
client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)


In [None]:
"""
Here we define the helper functions used to configure the deployed endpoint and to run inference against it.
"""

import boto3
import json

# Runtime client used by the helper functions below to call the deployed endpoint.
sagemaker_runtime = boto3.client("sagemaker-runtime")


def _post_json(endpoint_name, payload):
    """Send ``payload`` to the endpoint as JSON and return the decoded JSON reply.

    Uses the module-level ``sagemaker_runtime`` client. The response body is
    read in full, decoded as text and parsed as JSON.
    """
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType='application/json',
    )
    return json.loads(response['Body'].read().decode())


def invoke_sagemaker_endpoint(endpoint_name, llama_args):
    """Run an inference request against the endpoint.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_args: JSON-serialisable arguments, sent under the ``'args'`` key
            of a payload flagged with ``'inference': True``.

    Returns:
        The endpoint's response body parsed from JSON (its schema is defined by
        the container and is not visible here).
    """
    return _post_json(endpoint_name, {'inference': True, 'args': llama_args})


def configure_sagemaker_endpoint(endpoint_name, llama_model_args):
    """Send a configuration request to the endpoint.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_model_args: JSON-serialisable arguments, sent under the ``'args'``
            key of a payload flagged with ``'configure': True``.

    Returns:
        The endpoint's response body parsed from JSON (its schema is defined by
        the container and is not visible here).
    """
    return _post_json(endpoint_name, {'configure': True, 'args': llama_model_args})


In [None]:
"""
Once you have configured and deployed your environment, you can use the endpoint like this:
"""
# Arguments sent with the "configure" request.
# NOTE(review): the keys look like llama.cpp model-loading options plus the S3
# location of the model file -- confirm against the container's handler.
llama_model_args = dict(
    # S3 location of the model file
    bucket=bucket_name,
    key="mymodel.bin",
    # model options
    n_ctx=512,
    n_parts=-1,
    n_gpu_layers=0,
    seed=1337,
    f16_kv=True,
    logits_all=False,
    vocab_only=False,
    use_mmap=True,
    use_mlock=False,
    embedding=False,
    n_threads=None,
    n_batch=512,
    last_n_tokens_size=64,
    lora_base=None,
    lora_path=None,
    low_vram=False,
    tensor_split=None,
    rope_freq_base=10000,
    rope_freq_scale=1,
    verbose=False,
)


# Arguments sent with the "inference" request: the prompt plus sampling options.
# NOTE(review): the keys look like llama.cpp completion options -- confirm
# against the container's handler.
llama_args = dict(
    prompt="The quick brown fox",
    max_tokens=128,
    temperature=0.8,
    top_p=0.95,
    logprobs=None,
    echo=False,
    stop=[],
    frequency_penalty=0,
    presence_penalty=0,
    repeat_penalty=1.1,
    top_k=40,
    stream=False,
    tfs_z=1,
    mirostat_mode=0,
    mirostat_tau=5,
    mirostat_eta=0.1,
    model=None,
)

# Send the configure request first, then the inference request.
# NOTE(review): presumably the configure call makes the container load the
# model from S3 before inference is possible -- confirm against the container.
configuration = configure_sagemaker_endpoint(endpoint_name, llama_model_args)
inference = invoke_sagemaker_endpoint(endpoint_name, llama_args)

# End the cell with an expression so both responses are shown as its output;
# previously the results were computed but never displayed.
{"configuration": configuration, "inference": inference}