In [None]:
"""
This creates the IAM role for SageMaker with the appropriate permissions. Make sure you have created the bucket first.
You normally need to run it only once when setting things up
"""

import boto3

import boto3
import json



# One-time setup: create the SageMaker execution role and give it access to the
# model bucket. The cell is safe to re-run: if the role or the policy already
# exists it is reused instead of failing with EntityAlreadyExistsException.
iam = boto3.client('iam')

bucket_name = 'alexvt-adx-emr-eks'  # replace with your bucket name
role_name = 'SageMakerExecutionRole'  # replace with your preferred role name

# Trust policy: allows the SageMaker service to assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

try:
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Role is already there (cell re-run or earlier partial run). It is reused
    # as-is; its existing trust policy is NOT overwritten.
    create_role_response = iam.get_role(RoleName=role_name)

# Both create_role and get_role return the ARN under ['Role']['Arn'].
exec_role_arn = create_role_response['Role']['Arn']

# Permissions policy: every S3 action on the objects inside the bucket.
# NOTE(review): the resource only matches object ARNs ("bucket/*"); bucket-level
# actions such as s3:ListBucket are evaluated against "arn:aws:s3:::bucket" and
# are therefore not granted here - add that ARN if the container needs them.
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": f"arn:aws:s3:::{bucket_name}/*"
        }
    ]
}

try:
    create_policy_response = iam.create_policy(
        PolicyName=f"{role_name}Policy",
        PolicyDocument=json.dumps(policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Policy already exists. A customer managed policy created without a path has
    # the ARN arn:<partition>:iam::<account-id>:policy/<name>; partition and
    # account id are taken from the role ARN. The existing policy document is
    # reused unchanged (it is not updated if bucket_name was edited).
    arn_fields = exec_role_arn.split(':')
    existing_policy_arn = (
        f"arn:{arn_fields[1]}:iam::{arn_fields[4]}:policy/{role_name}Policy"
    )
    create_policy_response = iam.get_policy(PolicyArn=existing_policy_arn)

iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=create_policy_response['Policy']['Arn']
)



In [None]:
# Only needed when the role was created earlier and you just want its ARN back
# in `exec_role_arn` (skip this cell right after running the creation cell).
role_name = 'SageMakerExecutionRole'
iam = boto3.client('iam')

create_role_response = iam.get_role(RoleName=role_name)
exec_role_arn = create_role_response['Role']['Arn']
exec_role_arn

In [11]:
"""
This creates the endpoint with the appropriate permissions.
You can use this to deploy multiple endpoints as long as the model_name, endpoint_name and endpoint_config_name are unique.
You need to create a pull-through cache of public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest in your private ECR repo for this to work.
@see https://docs.aws.amazon.com/AmazonECR/latest/userguide/pull-through-cache.html
"""

# --- Names of the SageMaker resources created below (each must be unique) ---
model_name = 'llamacpp-arm64-c5-x9'
endpoint_config_name = 'sm-llama-arm-config-c5-x9'
endpoint_name = 'sm-llama-arm-c5-x9'

# --- Container image from the private pull-through cache; keep ONE line active ---
# Graviton (arm64 build):
#image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-arm64:latest'
# x86 (amd64 build):
image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-amd64:latest'

# The instance type has to match the image architecture (x86 or Graviton).
instance_type = "ml.c5.9xlarge"

client = boto3.client('sagemaker', region_name='us-east-1')

# Step 1: register the container image as a model that runs under exec_role_arn.
primary_container = {
    'Image': image,
    'Mode': 'SingleModel',
}
response = client.create_model(
    ModelName=model_name,
    PrimaryContainer=primary_container,
    ExecutionRoleArn=exec_role_arn,
)

# Step 2: endpoint configuration with a single variant on one instance.
production_variant = {
    'VariantName': 'default',
    'ModelName': model_name,
    'InitialInstanceCount': 1,
    'InstanceType': instance_type,
    'InitialVariantWeight': 1.0,
}
response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)

# Step 3: create the endpoint itself; `response` keeps this last reply.
response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

In [None]:
# Display the reply of the last SageMaker call made in the deployment cell (create_endpoint).
response

In [14]:
"""
Here we define the functionality to interact with the endpoint. First we need to configure it to load a GGUF or GGML model, and then we can run inference.
"""

import boto3
import json

# Runtime client that the endpoint helper functions below use for every request.
sagemaker_runtime = boto3.client(
    service_name='sagemaker-runtime',
    region_name='us-east-1',
)


def _post_json_to_endpoint(endpoint_name, payload):
    """Send ``payload`` as a JSON request to the endpoint and return the decoded JSON reply.

    Uses the module-level ``sagemaker_runtime`` client.
    """
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType='application/json',
    )
    # response['Body'] is a stream: read it fully, decode the bytes, parse the JSON.
    return json.loads(response['Body'].read().decode())


def invoke_sagemaker_endpoint(endpoint_name, llama_args):
    """Run an inference request against the endpoint.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_args: JSON-serializable inference arguments, sent under the
            ``'args'`` key of the request.

    Returns:
        The endpoint's response body parsed from JSON.
    """
    payload = {
        'inference': True,
        'configure': False,
        'args': llama_args,
    }
    return _post_json_to_endpoint(endpoint_name, payload)


def configure_sagemaker_endpoint(endpoint_name, llama_model_args):
    """Send a configuration request to the endpoint.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_model_args: JSON-serializable model arguments, sent under the
            ``'args'`` key of the request.

    Returns:
        The endpoint's response body parsed from JSON.
    """
    payload = {
        'configure': True,
        'inference': False,
        'args': llama_model_args,
    }
    return _post_json_to_endpoint(endpoint_name, payload)


In [19]:
"""
Wait until your model is InService.

This is to configure the model
overwrite 'bucket' and 'key' with your path to the model file.
set 'n_threads': NUMBER_OF_VCPUS - 1 to use all available vCPUs.
Execute this cell each time you want to load a new model into the endpoint without having to redeploy anything. 
Loading a model from S3 usually takes 20-30 seconds, depending on the S3 download speed.
"""

# Arguments sent with the "configure" request; key order matches the JSON body.
llama_model_args = dict(
    # S3 location of the model file to load
    bucket="alexvt-adx-emr-eks",
    key="llama-2-7b-arguments.Q4_K_M.gguf",
    # model loading options
    n_ctx=1024,
    n_parts=-1,
    n_gpu_layers=0,
    seed=1411,
    f16_kv=True,
    logits_all=False,
    vocab_only=False,
    use_mmap=True,
    use_mlock=False,
    embedding=False,
    n_threads=None,
    n_batch=512,
    last_n_tokens_size=64,
    lora_base=None,
    lora_path=None,
    low_vram=False,
    tensor_split=None,
    rope_freq_base=10000,
    rope_freq_scale=1,
    verbose=False,
)

# Ask the running endpoint to load the model described above.
configuration = configure_sagemaker_endpoint(endpoint_name, llama_model_args)

In [None]:
# Display the endpoint's reply to the configure request sent in the previous cell.
configuration
#endpoint_name

In [None]:
"""
Execute this cell each time you do the inference. Use the prompt format specific to the model you loaded.  
"""

# Arguments sent with the "inference" request; key order matches the JSON body.
llama_args = dict(
    prompt="[INST]Give concise answer to the question. Qiestion: How to define optimal shard size in Amazon Opensearch?[/INST]",
    max_tokens=800,
    temperature=0.1,
    top_p=0.5,
    logprobs=None,
    echo=False,
    stop=[],
    frequency_penalty=0,
    presence_penalty=0,
    repeat_penalty=1.1,
    top_k=40,
    stream=False,
    tfs_z=1,
    mirostat_mode=2,
    mirostat_tau=5,
    mirostat_eta=0.1,
    model=None,
)

# Send the request and show the parsed reply as the cell output.
inference = invoke_sagemaker_endpoint(endpoint_name, llama_args)
inference