In [None]:
"""
This creates the IAM role for SageMaker with the appropriate permissions. Make sure you have created the bucket first.
You normally need to run it only once, when setting things up.
"""

import boto3

import boto3
import json



# IAM client used for every call in this cell.
iam = boto3.client('iam')

bucket_name = 'alexvt-adx-emr-eks'  # replace with your bucket name
role_name = 'SageMakerExecutionRole'  # replace with your preferred role name

# Trust policy: allows the SageMaker service principal to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# create_role raises EntityAlreadyExists when the role is already there, which
# made this cell fail on any re-run. Fall back to looking the role up so the
# cell can be executed repeatedly. An existing role is reused as-is; its trust
# policy is NOT rewritten.
try:
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    create_role_response = iam.get_role(RoleName=role_name)

# Permissions policy attached to the role.
# - The S3 statement targets objects only ("/*"); bucket-level actions such as
#   s3:ListBucket are not covered by this resource.
# - NOTE(review): "<<ARN_NO>>" looks like a redacted placeholder, not a valid
#   ARN. Replace it with the CloudWatch Logs ARN(s) you want to allow before
#   running this cell.
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": f"arn:aws:s3:::{bucket_name}/*"
        },
        {
            "Action": "logs:PutLogEvents",
            "Effect": "Allow",
            "Resource": "<<ARN_NO>>",
            "Sid": "Logs"
        },
        {
            "Action": [
                "logs:DescribeLogStreams",
                "logs:CreateLogStream",
                "logs:CreateLogGroup"
            ],
            "Effect": "Allow",
            "Resource": "<<ARN_NO>>",
            "Sid": "Logs2"
        }
    ]
}

policy_name = f"{role_name}Policy"
try:
    create_policy_response = iam.create_policy(
        PolicyName=policy_name,
        PolicyDocument=json.dumps(policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The policy was created on an earlier run. Rebuild its ARN from the
    # partition and account id embedded in the role ARN
    # (arn:<partition>:iam::<account>:role/...). The policy is created without
    # a Path above, so it lives directly under "policy/". An existing policy
    # is reused as-is; its document is NOT updated.
    _role_arn_parts = create_role_response['Role']['Arn'].split(':')
    _existing_policy_arn = (
        f"arn:{_role_arn_parts[1]}:iam::{_role_arn_parts[4]}:policy/{policy_name}"
    )
    create_policy_response = iam.get_policy(PolicyArn=_existing_policy_arn)

# Both create_policy and get_policy return the ARN under ['Policy']['Arn'].
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=create_policy_response['Policy']['Arn']
)

# ARN that later cells pass to SageMaker as the execution role.
exec_role_arn = create_role_response['Role']['Arn']



In [67]:
# Run this cell only when the role already exists and you just need its ARN.
import boto3

role_name = 'SageMakerExecutionRole'
iam = boto3.client('iam')

# The result is stored under the same variable name the role-creation cell
# uses, even though this is a get_role lookup rather than a create_role call.
create_role_response = iam.get_role(RoleName=role_name)

# ARN handed to SageMaker as ExecutionRoleArn when the model is created.
exec_role_arn = create_role_response['Role']['Arn']
exec_role_arn

'arn:aws:iam::444975673530:role/SageMakerExecutionRole'

In [68]:
"""
This creates the endpoint with the appropriate permissions.
You can use this to deploy multiple endpoints as long as the model_name, endpoint_name and endpoint_config_name are unique.
You need to create a pull-through cache of public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest in your private ECR repo for this to work.
@see https://docs.aws.amazon.com/AmazonECR/latest/userguide/pull-through-cache.html
"""

# --- Deployment names (each must be unique per deployment) -------------------
model_name = 'llamacpp-arm64-c7-x8-v00'
endpoint_config_name = 'sm-llama-arm-config-c7-x8-v00'
endpoint_name = 'sm-llama-arm-c7-x8-v00-stream'

# Instance type must match the image architecture (x86 vs Graviton).
instance_type = "ml.c7g.8xlarge"

# --- Container image ----------------------------------------------------------
# Images are pulled from a private pull-through cache of
# public.ecr.aws/y0d1u8z0/llmm-cpu-arm64:latest.
# Alternative image, labelled "Graviton" in the original notes:
#image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-arm64:latest'
# Active image, labelled "x86" in the original notes.
# NOTE(review): the repository name says arm64, which contradicts the "x86"
# label - confirm which label is correct.
image = '444975673530.dkr.ecr.us-east-1.amazonaws.com/y0d1u8z0/y0d1u8z0/llmm-cpu-arm64-full-v00:perf'

client = boto3.client('sagemaker', region_name='us-east-1')

# Step 1: register the model (container image + execution role).
primary_container = {
    'Image': image,
    'Mode': 'SingleModel',
}
response = client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=exec_role_arn,
    PrimaryContainer=primary_container,
)

# Step 2: endpoint configuration with a single production variant.
default_variant = {
    'VariantName': 'default',
    'ModelName': model_name,
    'InitialInstanceCount': 1,
    'InstanceType': instance_type,
    'InitialVariantWeight': 1.0,
}
response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[default_variant],
)

# Step 3: create the endpoint; `response` ends up holding this last result.
response = client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)

In [4]:
# Display the most recent value bound to `response` (the create_endpoint
# result when the deployment cell was the last one to assign it).
response

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:444975673530:endpoint/sm-llama-arm-c7-x8-v00-openblas',
 'ResponseMetadata': {'RequestId': '564ebdd6-c1e8-4598-97da-14cf28c0cfaa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '564ebdd6-c1e8-4598-97da-14cf28c0cfaa',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '99',
   'date': 'Fri, 13 Oct 2023 19:33:34 GMT'},
  'RetryAttempts': 0}}

In [3]:
"""
Here we define the functionality to interact with endpoint. First we need to configure it to load the GGUF or GGML models and then we can do the inference
"""

import boto3
import json

# Module-level SageMaker Runtime client (us-east-1); the invoke/configure
# helpers defined below read it as a global.
sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')


def invoke_sagemaker_endpoint(endpoint_name, llama_args):
    """Send an inference request to the endpoint and return the parsed reply.

    The request body is a JSON object with ``inference`` set to True,
    ``configure`` set to False and ``llama_args`` under ``args``.

    Note: a later cell in this notebook defines another function with the
    same name that sends its payload unwrapped; whichever cell ran last wins.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_args: JSON-serializable generation arguments.

    Returns:
        The endpoint's response body decoded from JSON.
    """
    request_json = json.dumps({
        'inference': True,
        'configure': False,
        'args': llama_args,
    })
    reply = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=request_json,
    )
    raw_reply = reply['Body'].read()
    return json.loads(raw_reply.decode())

def configure_sagemaker_endpoint(endpoint_name, llama_model_args):
    """Send a configuration request to the endpoint and return the parsed reply.

    The request body is a JSON object with ``configure`` set to True,
    ``inference`` set to False and ``llama_model_args`` under ``args``.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        llama_model_args: JSON-serializable model-loading arguments.

    Returns:
        The endpoint's response body decoded from JSON.
    """
    request_json = json.dumps({
        'configure': True,
        'inference': False,
        'args': llama_model_args,
    })
    reply = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=request_json,
    )
    raw_reply = reply['Body'].read()
    return json.loads(raw_reply.decode())


In [None]:
"""
Wait until your model is InService.

This cell configures the model.
Overwrite 'bucket' and 'key' with your path to the model file.
Set 'n_threads' to NUMBER_OF_VCPUS - 1 to use all available vCPUs.
Execute this cell each time you want to load a new model into the endpoint without having to redeploy anything.
Loading the model from S3 usually takes 20-30 seconds, depending on the S3 download speed.
"""

# Model-loading arguments, sent to the endpoint under the "args" key of the
# configure request. Key order is kept so the serialized JSON is unchanged.
llama_model_args = dict(
    # Where the model file lives in S3.
    bucket="alexvt-adx-emr-eks",
    key="llama-2-7b-chat.Q5_K_M.gguf",
    # Remaining loader options, forwarded as-is.
    n_ctx=1024,
    n_parts=-1,
    n_gpu_layers=0,
    seed=1411,
    f16_kv=True,
    logits_all=False,
    vocab_only=False,
    use_mmap=False,
    use_mlock=True,
    embedding=False,
    n_threads=None,
    n_batch=512,
    last_n_tokens_size=64,
    lora_base=None,
    lora_path=None,
    low_vram=False,
    tensor_split=None,
    rope_freq_base=10000,
    rope_freq_scale=1,
    verbose=False,
)

# Push the configuration to the running endpoint and show its reply.
configuration = configure_sagemaker_endpoint(endpoint_name, llama_model_args)
configuration

{'status': 'success'}

In [16]:
# Show which endpoint name is currently set; the inference cells pass it on.
endpoint_name

'sm-llama-arm-c7-x8-v00-stream'

In [None]:
"""
Execute this cell each time you do the inference. Use the prompt format specific to the model you loaded.  
"""
# Prompt for the non-streaming inference call: an instruction, a >>CONTEXT<<
# passage, a >>QUESTION<<, and an empty >>ANSWER<< slot for the model to fill.
# The text is sent to the model verbatim, so it is left exactly as written.
template2 = """Answer the question as truthfully as possible by using the provided informaiton in >>CONTEXT<<. If the answer is not contained within the >>CONTEXT<<, respond with "I can't answer that".

>>CONTEXT<<:
The overall goal of choosing a number of shards is to distribute an index evenly\n                across all data nodes in the cluster. However, these shards shouldn't be too large\n                or too numerous. A general guideline is to try to keep shard size between\n                10–30 GiB for workloads where search latency is a key performance objective,\n                and 30–50 GiB for write-heavy workloads such as log analytics.\n\nLarge shards can make it difficult for OpenSearch to recover from failure, but\n                because each shard uses some amount of CPU and memory, having too many small shards\n                can cause performance issues and out of memory errors. In other words, shards should\n                be small enough that the underlying OpenSearch Service instance can handle them, but not so small\n                that they place needless strain on the hardware. Shard to CPU ratio – When a shard is\n                involved in an indexing or search request, it uses a vCPU to process the request. As\n                a best practice, use an initial scale point of 1.5 vCPU per shard. If your instance\n                type has 8 vCPUs, set your data node count so that each node has no more than six\n                shards. Note that this is an approximation. Be sure to test your workload and scale\n                your cluster accordingly.\n\nFor storage volume, shard size, and instance type recommendations, see the\n                following resources:\n\nSizing Amazon OpenSearch Service domains\n\nPetabyte scale in Amazon OpenSearch Service\n\nAvoid storage skew

>>QUESTION<<:
if i have 8vcpus, how many shards i can use?

>>ANSWER<<:
"""


# Generation arguments, sent to the endpoint under the "args" key of the
# inference request. Key order is kept so the serialized JSON is unchanged.
llama_args = {
    # Prompt text
    'prompt': template2,
    # Generation options, forwarded as-is
    'max_tokens': 1000,
    'temperature': 0.1,
    'top_p': 0.5,
    'logprobs': None,
    'echo': False,
    'stop': [],
    'frequency_penalty': 0,
    'presence_penalty': 0,
    'repeat_penalty': 1.1,
    'top_k': 40,
    'stream': False,
    'tfs_z': 1,
    'mirostat_mode': 0,
    'mirostat_tau': 5,
    'mirostat_eta': 0.1,
    'model': None,
}

# NOTE(review): this call expects the invoke_sagemaker_endpoint variant that
# wraps its second argument under "args"; a later cell redefines that name
# with a raw-payload variant, so cell execution order matters here.
inference = invoke_sagemaker_endpoint(endpoint_name, llama_args)
inference

{'id': 'cmpl-5060b3c9-bffe-4e15-909e-451c9909798e',
 'object': 'text_completion',
 'created': 1697010565,
 'model': '/app/llm_model.bin',
 'choices': [{'text': 'Based on the information provided in the >>CONTEXT<<, if you have 8 vCPUs, you can use a maximum of 6 shards per data node.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 443, 'completion_tokens': 36, 'total_tokens': 479}}

In [76]:
#use with SM-compatible endpoint format

import boto3
import json

# Module-level SageMaker Runtime client (us-east-1); the streaming and
# non-streaming helpers in this cell read it as a global.
sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')

def invoke_sagemaker_streaming_endpoint(endpoint_name, payload):
    """Invoke the endpoint with response streaming and print text as it arrives.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        payload: JSON-serializable request body; sent as-is.

    Returns:
        None. The streamed text is written to stdout.

    Raises:
        UnicodeDecodeError: If the stream as a whole is not valid UTF-8.
    """
    import codecs  # local import: only this helper needs it

    response = sagemaker_runtime.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType='application/json',
    )

    # Chunk boundaries are arbitrary byte offsets, so a multi-byte UTF-8
    # character can be split across two PayloadPart events. Decoding each
    # chunk on its own would raise UnicodeDecodeError in that case; the
    # incremental decoder buffers the partial bytes until the rest arrives.
    decoder = codecs.getincrementaldecoder("utf-8")()
    for event in response['Body']:
        chunk = event['PayloadPart']['Bytes']
        # flush so text shows up as it streams instead of when the buffer fills
        print(decoder.decode(chunk), end='', flush=True)

    # Finalize: emits nothing for a well-formed stream, raises if the stream
    # ended in the middle of a character.
    print(decoder.decode(b'', final=True), end='', flush=True)

def invoke_sagemaker_endpoint(endpoint_name, payload):
    """Send ``payload`` to the endpoint unchanged and return the parsed reply.

    Unlike the earlier helper of the same name in this notebook, the payload
    is not wrapped; it is serialized to JSON exactly as given.

    Args:
        endpoint_name: Name of the SageMaker endpoint to call.
        payload: JSON-serializable request body.

    Returns:
        The endpoint's response body decoded from JSON.
    """
    request_json = json.dumps(payload)
    reply = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=request_json,
    )
    raw_reply = reply['Body'].read()
    return json.loads(raw_reply.decode())

# Prompt for the SM-compatible request below: an instruction, a >>CONTEXT<<
# passage, a >>QUESTION<<, and an empty >>Answer<< slot for the model to fill.
# The text is sent to the model verbatim, so it is left exactly as written.
template2 = """Answer the question as truthfully as possible by using the provided informaiton in >>CONTEXT<<.
>>CONTEXT<<:
For example, for an index with five primary shards and one replica, each indexing
            request touches 10 shards. In contrast, search requests are sent to n shards, where n  is
            the number of primary shards. For an index with five primary shards and one replica,
            each search query touches five shards (primary or replica) from that index.

Determine shard and data node counts

Use the following best practices to determine shard and data node counts for your
                domain.

Shard count – The distribution of shards
                to data nodes has a large impact on a domain’s performance. When you have indexes
                with multiple shards, try to make the shard count an even multiple of the data node
                count. This helps to ensure that shards are evenly distributed across data nodes,
                and prevents hot nodes. For example, if you have 12 primary shards, your data node
                count should be 2, 3, 4, 6, or 12. However, shard count is secondary to shard
                size—if you have 5 GiB of data, you should still use a single shard.

>>QUESTION<<:
Gvie instructions on how to determine shard and data node counts for OpenSearch?

>>Answer<<:
"""

# Request body in the SM-compatible shape: prompt under "inputs", generation
# settings under "parameters".
generation_parameters = {
    'max_new_tokens': 600,
    'top_k': 300,
    'top_p': 0.95,
    'temperature': 0.5,
    'stream': True,
}
payload = {
    'inputs': template2,
    'parameters': generation_parameters,
}

# Streaming call: the helper prints text as it arrives and returns nothing,
# so `inference` is None here and the final expression displays no value.
inference = invoke_sagemaker_streaming_endpoint(endpoint_name, payload)
# Non-streaming alternative:
#inference = invoke_sagemaker_endpoint(endpoint_name,payload)

inference

To determine the shard and data node counts for an OpenSearch cluster, follow these steps:

1. Analyze the data distribution: Determine how evenly distributed your data is across your indexes. If your data is heavily skewed towards one or a few indexes, consider creating additional shards for those indexes to balance the load.
2. Consider shard size: The size of each shard can also impact performance. For example, if you have 5 GiB of data in an index, using a single shard may be more efficient than spreading it across multiple shards.
3. Choose a shard count that is an even multiple of the data node count: This helps ensure that shards are evenly distributed across data nodes and prevents hot nodes. Refer to the best practices section for examples of appropriate shard counts based on data node counts.
4. Determine the number of replicas: Replicas are used for fault tolerance and can improve search performance. Consider the importance of maintaining data consistency and the trade-off b