## Install necessary packages and import libraries

In [2]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install --upgrade sagemaker==2.194.0 boto3 transformers  PyMuPDF langchain

Note: you may need to restart the kernel to use updated packages.


### Restart the kernel to ensure all packages are enabled 

## Part 1: Deploy the Llama-2-13b LLM as a SageMaker endpoint on an inf2.48xlarge instance

In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris 
import io
sagemaker.__version__, boto3.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


('2.194.0', '1.28.73')

In [3]:
# Single region shared by the boto3 session and both SageMaker clients below.
AWS_REGION = "us-east-1"
boto3_session = boto3.session.Session(region_name=AWS_REGION)

# `smr` invokes the endpoint; `sm` creates and deletes SageMaker resources.
smr = boto3.client("sagemaker-runtime", region_name=AWS_REGION)
sm = boto3.client("sagemaker", region_name=AWS_REGION)

# Execution role that is later handed to the Model definition.
role = sagemaker.get_execution_role()

# SageMaker SDK session for interacting with different AWS APIs, wired to the
# clients created above.
sess = sagemaker.session.Session(
    boto3_session,
    sagemaker_client=sm,
    sagemaker_runtime_client=smr,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
# Hosting instance type, plus the model name from which the endpoint name is derived.
instance_type = "ml.inf2.48xlarge"
model_name = "smep-inf2-llama2-13b-chat-model"
endpoint_name = model_name + "-endpoint"

In [5]:
## Replace these values to use your own container image and S3-hosted model weights
prefix = "torchserve"
external_image_uri = "102048127330.dkr.ecr.us-east-1.amazonaws.com/neuronx:2-14-1"
external_s3_uri = "s3://gai-model-artifacts/" + prefix + "/model_store/llama-2-13b-chat/"

In [6]:
## Instantiate the SageMaker model definition
model = Model(
    name=model_name,
    image_uri=external_image_uri,
    role=role,
    sagemaker_session=sess,
    # Enable SageMaker uncompressed model artifacts: model_data is an
    # S3DataSource pointing at an S3 prefix with CompressionType "None".
    model_data={
        "S3DataSource": {
            "S3Uri": external_s3_uri,
            "S3DataType": "S3Prefix",
            "CompressionType": "None",
        }
    },
    env={"TS_INSTALL_PY_DEP_PER_MODEL": "true"},
)

#### Next, we will deploy the LLM as a SageMaker endpoint. Deployment can take up to 15 minutes.

In [7]:
%%time
model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    # increase the size to store large model, mandatory for inf2 instance types
    volume_size=512,
    # increase the timeout to download large model
    model_data_download_timeout=3600,
    # increase the timeout to load large model
    container_startup_health_check_timeout=600,
)

Your model is not compiled. Please compile your model before using Inferentia.


---------------!CPU times: user 110 ms, sys: 5.73 ms, total: 115 ms
Wall time: 8min 3s


In [8]:

## define class to parse the streaming responses when endpoint is invoked

class Parser:
    r"""
    A helper class for parsing the byte stream returned by the endpoint.

    The output of the model will be in the following format:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```
    While usually each PayloadPart event from the event stream will contain a
    byte array with a full json, this is not guaranteed and some of the json
    objects may be split across PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```
    This class accounts for this by concatenating bytes written via the
    'write' function and then exposing a method which will return lines
    (ending with a '\n' character) within the buffer via the 'scan_lines'
    function. It maintains the position of the last read position to ensure
    that previous bytes are not exposed again. Bytes that follow the last
    '\n' stay buffered until more data completes the line, or until 'flush'
    is called at the end of the stream.
    """

    def __init__(self):
        self.buff = io.BytesIO()
        # Offset of the first byte that has not been handed to a caller yet.
        self.read_pos = 0

    def write(self, content):
        """Append ``content`` (bytes) to the end of the internal buffer."""
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def scan_lines(self):
        """Yield every complete, not-yet-returned line without its trailing b'\\n'.

        An unterminated trailing fragment is left in the buffer so a later
        ``write`` can complete it.
        """
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Indexing bytes yields an int in Python 3, so compare with
            # endswith() instead of `line[-1] != b'\n'` (which is always true).
            if not line.endswith(b'\n'):
                break
            self.read_pos += len(line)
            yield line[:-1]

    def flush(self):
        """Return the unread bytes after the last complete line and mark them read.

        Intended for the end of a stream whose final line has no newline.
        Returns b'' when nothing is pending.
        """
        self.buff.seek(self.read_pos)
        tail = self.buff.read()
        self.read_pos += len(tail)
        return tail

    def reset(self):
        """Rewind the read position to the start of the buffer (data is kept)."""
        self.read_pos = 0


## define a function to return inference results

def run_infer(endpoint_name, body):
    """Invoke the endpoint with response streaming and return the decoded text.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        body: Request payload as bytes.

    Returns:
        A string made of every streamed line decoded as UTF-8, each followed
        by a single space. A final fragment without a newline is included too.
    """
    resp = smr.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        Body=body,
        ContentType="application/json",
    )
    parser = Parser()
    pieces = []
    for event in resp['Body']:
        parser.write(event['PayloadPart']['Bytes'])
        # Only complete lines are decoded, so a record split across events is
        # reassembled before it reaches the result.
        pieces.extend(line.decode("utf-8") for line in parser.scan_lines())

    # The stream may end without a trailing newline; do not drop that text.
    tail = parser.flush()
    if tail:
        pieces.append(tail.decode("utf-8"))
    return ''.join(piece + ' ' for piece in pieces)

In [9]:
%%time
# First request against the streaming endpoint, using a "User: ... Assistant:" prompt.
# NOTE(review): the payload is plain text while run_infer sends it with
# ContentType "application/json" — confirm the container's handler expects this.
body = """User: Explain the self-attention mechanism that Transformers use like I'm ten years old.
          Assistant:""".encode('utf-8')
results = run_infer(endpoint_name, body)
print(f'This is the result of inference request #1 \n\n {results}')

This is the result of inference request #1 

 User: Explain the self-attention mechanism that Transformers use like I'm ten years old.           Assistant: Sure! So, you know how sometimes you're talking to someone and you want to make sure they understand what you're saying? Like, you might say "Hey, did you hear me?" or "Can you repeat that back to me?"           That's kind of like what the self-attention mechanism does in Transformers. It's like a way for the model to say "Hey, did you hear me?" to itself, and then listen to what it just said to make sure it's correct.           So, imagine you're trying to translate a sentence from English to Spanish. The self-attention mechanism would let the model look at each word in the sentence, and then say "Hey, did I get this right?" to itself. If it didn't get it right, it would listen to what it just said and try again.           It's like having a little voice in your head that's always checking to make sure you're understanding what yo

In [None]:
%%time
# Second request: a summarization prompt. The text between the ``` fences is
# the passage to summarize; the whole literal is sent to the endpoint as-is.
body2 = """
    Write a concise summary of the text, return your responses with 2 lines that cover
the key points of the following text.
    ```
    Intended Use
    Intended Use Cases Llama 2 is intended for commercial and research use in English.
    Tuned models are intended for assistant-like chat, whereas pretrained models can
be
    adapted for a variety of natural language generation tasks.
    To get the expected features and performance for the chat versions, a
    specific formatting needs to be followed, including the INST and <<SYS>> tags,
    BOS and EOS tokens, and the whitespaces and breaklines in between (
        we recommend calling strip() on inputs to avoid double-spaces).
        See our reference code in github for details: chat_completion.
    Out-of-scope Uses Use in any manner that violates applicable laws or regulations
    (including trade compliance laws).Use in languages other than English.
    Use in any other way that is prohibited by the Acceptable Use Policy and Licensing
Agreement for Llama 2.
    ```
    SUMMARY:
    """.encode('utf-8')
results = run_infer(endpoint_name, body2)
print(f'This is the response for warm up inference request #2:\n\n {results}')

## Run benchmark tests 

In [None]:
from datetime import datetime 
import time
import json
from transformers import GPT2Tokenizer
import pandas as pd 

## function to initialize gpt2 tokenizer
## we will use tokenizer to count input and output tokens 

def get_tokenizer(id="gpt2"):
    """Load a GPT-2 tokenizer used to count input and output tokens.

    Args:
        id: Identifier passed to ``GPT2Tokenizer.from_pretrained``.

    Returns:
        The tokenizer, or ``None`` when it cannot be retrieved. The failure
        reason is printed so it is not silently lost.
    """
    try:
        tokenizer = GPT2Tokenizer.from_pretrained(id)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell.
        print(f'unable to retrieve tokenizer: {id} ({exc!r})')
        return None
    print(f'retrieved tokenizer: {id}')
    return tokenizer

## function to return count of tokens in the text 

def get_token_count(tokenizer, content: str) -> int:
    """Return the number of tokens ``tokenizer`` produces for ``content``.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry,
            or ``None`` when no tokenizer could be loaded.
        content: Text to tokenize.

    Returns:
        The length of ``'input_ids'``, or 0 when the tokenizer is missing or
        tokenization fails (best effort: counting never raises).
    """
    if tokenizer is None:
        print('error counting tokens tokenizer not available')
        return 0
    try:
        return len(tokenizer(content)['input_ids'])
    except Exception as exc:
        # Report the real cause instead of always blaming a missing tokenizer.
        print(f'error counting tokens: {exc!r}')
        return 0

def run_test(prompts: list):
    """Send each prompt to the endpoint once and record latency and token counts.

    Args:
        prompts: Prompt strings; each is UTF-8 encoded and sent with
            ``smr.invoke_endpoint`` to ``endpoint_name``.

    Returns:
        A list with one dict per prompt containing ``start_time``,
        ``end_time``, ``input_token_count``, ``output_token_count``,
        ``latency_ms`` and ``is_error``. For a failed request ``is_error`` is 1
        and the counts and latency are 0.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"
    results = []
    tokenizer = get_tokenizer()

    for prompt_id, prompt in enumerate(prompts, start=1):
        is_error = 0
        # Fallback start time in case the failure happens before the request.
        start_time = time.time()
        try:
            print(f'prompt id: {prompt_id}')
            input_token_count = get_token_count(tokenizer, prompt)
            print(f'input token count: {input_token_count}')
            start_time = time.time()
            body = prompt.encode('utf-8')
            response = smr.invoke_endpoint(EndpointName=endpoint_name,
                                           Body=body,
                                           ContentType="application/json")
            end_time = time.time()
            latency_ms = int((end_time - start_time) * 1000)
            print(f'duration(ms): {latency_ms}')
            response_text = response["Body"].read().decode("utf8")
            output_token_count = get_token_count(tokenizer, response_text)
            print(f'output token count: {output_token_count}')
        except Exception as e:
            # Bind the exception: the previous bare `except:` printed an
            # undefined `e`, which raised NameError and aborted the whole run.
            print(e)
            is_error = 1
            latency_ms = 0
            input_token_count = 0
            output_token_count = 0
            end_time = time.time()

        results.append({
            # [:-3] trims microseconds down to milliseconds.
            "start_time": datetime.fromtimestamp(start_time).strftime(timestamp_format)[:-3],
            "end_time": datetime.fromtimestamp(end_time).strftime(timestamp_format)[:-3],
            "input_token_count": input_token_count,
            "output_token_count": output_token_count,
            "latency_ms": latency_ms,
            # Lets failed requests be told apart from (and filtered out of) the stats.
            "is_error": is_error,
        })

    print('completed test')
    return results


In [None]:
## Run the benchmark tests

# Benchmark prompts; each uses the "User: ... Assistant:" layout and is sent
# to the endpoint once by run_test, which returns one record per prompt.
test_prompts = [
    "User: give me a recipe for an old fashioned cocktail\nAssistant:",
    "User: Write a poem about open source machine learning. \nAssistant:",
    "User: give me a recipe for home made mayonnaise \nAssistant:",
    "User: Explain generative AI to me like I am a 5th grade student who is 12 years old. \nAssistant:",
    "User: Respond to this question only based on the information provided here. Cats like dogs, and dogs like rabbits. Cats like anything that dogs like. I really really dislike rabbits. How do cats feel about rabbits?\nAssistant:"
]
results = run_test(prompts=test_prompts)

In [None]:
# Tabulate the per-prompt records, then report averages and totals.
results = pd.DataFrame.from_dict(results)
for label, value, unit in (
    ("Mean Output Tokens", results['output_token_count'].mean(), "tokens"),
    ("Mean Latency", results['latency_ms'].mean(), "milliseconds"),
    ("Total Output Tokens", results['output_token_count'].sum(), "tokens"),
    # latency_ms is in milliseconds; divide to report seconds.
    ("Total Duration", results['latency_ms'].sum()/1000.0, "seconds"),
):
    print(f"{label} : {value} {unit}")

## Last Step: Clean up endpoint and associated artifacts 

In [None]:
# Delete the endpoint first, then its configuration, then the model.
# The endpoint configuration is addressed by the endpoint's own name.
cleanup_steps = (
    (sm.delete_endpoint, {"EndpointName": endpoint_name}),
    (sm.delete_endpoint_config, {"EndpointConfigName": endpoint_name}),
    (sm.delete_model, {"ModelName": model_name}),
)
for delete_call, call_kwargs in cleanup_steps:
    delete_call(**call_kwargs)
print('Cleanup Done!')