In [32]:
"""
This creates the IAM role for SageMaker  with the appropriate permissions. Make sure you have the bucket created first.
You normally need to run it only once when setting things up
"""

import boto3

import boto3
import json



iam = boto3.client('iam')

bucket_name = 'alexvt-adx-emr-eks'  # replace with your bucket name
role_name = 'SageMakerExecutionRole'  # replace with your preferred role name

assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

create_role_response = iam.create_role(
    RoleName=role_name,
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
)

policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": f"arn:aws:s3:::*"
        },
        {
            "Effect": "Allow",
            "Action": "ecr:*",
            "Resource": f"arn:aws:ecr:us-east-1:*:repository/*"
        }
    ]
}

create_policy_response = iam.create_policy(
    PolicyName=f"{role_name}Policy",
    PolicyDocument=json.dumps(policy_document)
)

iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=create_policy_response['Policy']['Arn']
)




exec_role_arn=create_role_response['Role']['Arn']



In [3]:
# run this cell only if you need to get exec_role_arn for already created role
iam = boto3.client('iam')
role_name = 'SageMakerExecutionRole'
create_role_response = iam.get_role(RoleName=role_name)
exec_role_arn=create_role_response['Role']['Arn']
exec_role_arn

'arn:aws:iam::444975673530:role/SageMakerExecutionRole'

In [9]:
"""
This creates the endpoint with the appropriate permissions.
You can use this to deploy multiple endpoints as long as the model_name, endpoint_name and endpoint_config_name are unique.
Need to create a pull-through cache of public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest' in your private ECR repo for this to work.
@see https://docs.aws.amazon.com/AmazonECR/latest/userguide/pull-through-cache.html
"""

import boto3

model_name = 'llamacpp-arm64-c7g-x8'
#Graviton
#image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-arm64:latest'  #private pull-through cache of public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest
#x86
#image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-amd64:latest'  #private pull-through cache of public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest
image='444975673530.dkr.ecr.us-east-1.amazonaws.com/model-image-repository'
endpoint_config_name = 'sm-llama-arm-config-c7g-x8'
endpoint_name = 'sm-llama-arm-c7g-x8'
instance_type="ml.c7g.8xlarge" # make sure you use correct instance types x86 or graviton 

client = boto3.client('sagemaker', region_name='us-east-1')
response = client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        'Image': image,
        'Mode': 'SingleModel',
    },
    ExecutionRoleArn=exec_role_arn
)



response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            'VariantName': 'default',
            'ModelName': model_name,
            'InitialInstanceCount': 1,
            'InstanceType': instance_type,
            'InitialVariantWeight': 1.0,
        },
    ],
)


response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

In [36]:
response

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:444975673530:endpoint/sm-llama-arm-c5-x9',
 'ResponseMetadata': {'RequestId': 'e854ab6c-214e-475d-896c-5f056aa242fc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e854ab6c-214e-475d-896c-5f056aa242fc',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '86',
   'date': 'Tue, 05 Dec 2023 16:27:31 GMT'},
  'RetryAttempts': 0}}

In [10]:
"""
Here we define the functionality to interact with endpoint. First we need to configure it to load the GGUF or GGML models and then we can do the inference
"""

import boto3
import json

sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')
#endpoint_name='llm-cpu-llama-2-7b-chat-Endpoint'

def invoke_sagemaker_endpoint(endpoint_name, llama_args):
    payload = {
        'inference': True,
        'configure': False,
        'args': llama_args
    }
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(llama_args),
        ContentType='application/json',
    )
    response_body = json.loads(response['Body'].read().decode())
    return response_body

def invoke_sagemaker_streaming_endpoint(endpoint_name, payload):
    response = sagemaker_runtime.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType='application/json',
    )    
    event_stream = response['Body']
    for line in event_stream:
        try:
            itm = json.loads(line['PayloadPart']['Bytes'].decode("utf-8")[6:])
            print(itm["choices"][0]["text"], end='')
        except:
            print(line['PayloadPart']['Bytes'].decode("utf-8")[6:])


def configure_sagemaker_endpoint(endpoint_name, llama_model_args):
    payload = {
        'configure': True,
        'inference': False,
        'args': llama_model_args
    }

    payload = {"configure": {"bucket":"modeldownloadstack-modelbucketc6ceab13-1qgdn2wkgqda3","key":"llama-2-7b-chat.Q4_K_M.gguf"}}
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType='application/json',
    )
    #response_body = json.loads(response['Body'].read().decode())
    response_body = response['Body'].read().decode()
    return response_body


In [11]:
"""
Wait until your model is InService.

This is to configure the model
overwrite 'bucket' and 'key' with your path to the model file.
set 'n_threads': NUMBER_OF_VPCUS - 1 to use all available VPCUS.
Execute this cell each time you want to load a new model into the endpoint without having to redeploy anything. 
Loading model from S3 usualy takes 20-30 seconds but depends on loading speed from S3.
"""

llama_model_args = {
    "bucket":"alexvt-adx-emr-eks",
    "key":"llama-2-7b-arguments.Q4_K_M.gguf", 
    "n_ctx": 1024,
    "n_parts": -1,
    "n_gpu_layers": 0,
    "seed": 1411,
    "f16_kv": True,
    "logits_all": False,
    "vocab_only": False,
    "use_mmap": True,
    "use_mlock": False,
    "embedding": False,
    "n_threads": None,
    "n_batch": 512,
    "last_n_tokens_size": 64,
    "lora_base": None,
    "lora_path": None,
    "low_vram": False,
    "tensor_split": None,
    "rope_freq_base": 10000,
    "rope_freq_scale": 1,
    "verbose": False,
}



configuration = configure_sagemaker_endpoint(endpoint_name,llama_model_args)

In [None]:
configuration
#endpoint_name

In [12]:
"""
Execute this cell each time you do the inference. Use the prompt format specific to the model you loaded.  
"""
#"max_tokens": 800,

llama_args = {
    "prompt": "Give concise answer to the question. Qiestion: How to define optimal shard size in Amazon Opensearch?",
    "max_tokens": 128,
    "temperature": 0.1,
    "top_p": 0.5
}

inference = invoke_sagemaker_endpoint(endpoint_name,llama_args)
inference

{'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'text': '\nA. Determine the average number of documents per shard and calculate the ideal shard size based on that number.\nB. Calculate the total amount of memory required for the index and divide it by the number of shards to determine the optimal shard size.\nC. Monitor the performance of your search cluster and adjust the shard size as needed to maintain optimal performance.\nD. Use a formula that takes into account the number of documents, the average document size, and the desired level of parallelism to calculate the ideal shard size.'}],
 'created': 1701884722,
 'id': 'cmpl',
 'model': 'LLaMA_CPP',
 'object': 'text_completion',
 'truncated': False,
 'usage': {'completion_tokens': 116, 'prompt_tokens': 26, 'total_tokens': 142}}