# lama2-7b Model with response streaming

### Import required libraries and establish session using SageMaker SDK

In [1]:
!pip install sagemaker boto3 huggingface_hub --upgrade --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.29.36 requires botocore==1.31.36, but you have botocore 1.31.58 which is incompatible.
awscli 1.29.36 requires s3transfer<0.7.0,>=0.6.0, but you have s3transfer 0.7.0 which is incompatible.[0m[31m
[0m

In [2]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
model_bucket = sess.default_bucket()  # bucket to house model artifacts
s3_code_prefix = "hf-large-model-djl/meta-llama/Llama-2-7b-chat-hf/code"  # folder within bucket where code artifact will go

s3_model_prefix = "hf-large-model-djl/meta-llama/Llama-2-7b-chat-hf/model"  # folder within bucket where model artifact will go
region = sess._region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()

### [OPTIONAL] Download the model from Hugging Face and upload the model artifacts on Amazon S3

If you intend to download your copy of the model and upload it to a s3 location in your AWS account, please follow the below steps, else you can skip to the next step.

In [5]:
from huggingface_hub import snapshot_download
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token('REPLACE WITH YOUR HF TOKEN')
from pathlib import Path
import os

# - This will download the model into the current directory where ever the jupyter notebook is running
local_model_path = Path(".")
local_model_path.mkdir(exist_ok=True)
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Only download pytorch checkpoint files
allow_patterns = ["*.json", "*.txt", "*.model", "*.safetensors", "*.bin", "*.chk", "*.pth"]

# - Leverage the snapshot library to donload the model since the model is stored in repository using LFS
model_download_path = snapshot_download(
    repo_id=model_name, cache_dir=local_model_path, allow_patterns=allow_patterns
)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Downloading (…)d7450235/LICENSE.txt:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading (…)d7450235/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading (…)50235/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

In [6]:
# upload files from local to S3 location
pretrained_model_location = sess.upload_data(path=model_download_path, key_prefix=s3_model_prefix)
print(f"Model uploaded to --- > {pretrained_model_location}")

Model uploaded to --- > s3://sagemaker-us-west-2-757967535041/hf-large-model-djl/meta-llama/Llama-2-7b-chat-hf/model


In [7]:
# Cleanup locally stored model files post S3 upload
!rm -rf {model_download_path}

### Define a variable to contain the s3 url of the location that has the model

In [8]:
# Define a variable to contain the s3 url of the location that has the model. For demo purpose, we use Llama-2-7b-fp16 model artifacts from our S3 bucket
# pretrained_model_location = f"s3://sagemaker-example-files-prod-{region}/models/llama-2/7b-chat/"

In [9]:
!rm -rf code_llama2_7b_fp16
!mkdir -p code_llama2_7b_fp16

In [10]:
%%writefile code_llama2_7b_fp16/serving.properties
engine = Python
option.entryPoint = djl_python.huggingface
option.tensor_parallel_degree = 2
batch_size = 64
max_batch_delay = 1000
option.model_loading_timeout = 900
option.model_id = {{model_id}}
option.enable_streaming = true
option.output_formatter=jsonlines
option.paged_attention=false

Writing code_llama2_7b_fp16/serving.properties


In [11]:
# we plug in the appropriate model location into our `serving.properties`
template = jinja_env.from_string(Path("code_llama2_7b_fp16/serving.properties").open().read())
Path("code_llama2_7b_fp16/serving.properties").open("w").write(
    template.render(model_id=pretrained_model_location)
)
!pygmentize code_llama2_7b_fp16/serving.properties | cat -n

     1	[36mengine[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33mPython[39;49;00m[37m[39;49;00m
     2	[36moption.entryPoint[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33mdjl_python.huggingface[39;49;00m[37m[39;49;00m
     3	[36moption.tensor_parallel_degree[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m2[39;49;00m[37m[39;49;00m
     4	[36mbatch_size[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m64[39;49;00m[37m[39;49;00m
     5	[36mmax_batch_delay[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m1000[39;49;00m[37m[39;49;00m
     6	[36moption.model_loading_timeout[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33m900[39;49;00m[37m[39;49;00m
     7	[36moption.model_id[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33ms3://sagemaker-us-west-2-757967535041/hf-large-model-djl/meta-llama/Llama-2-7b-chat-hf/model[39;49;00m[37m[39;49;00m
     8	[36moption.enable_streaming[39;49;00m[37m [39;49;00m=[37m [39;49;00m[33mtrue[39;49;00m[37m[39;49

**Image URI for the DJL container is being used here**

In [12]:
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed", region=region, version="0.23.0"
)
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118


**Create the Tarball and then upload to S3 location**

In [13]:
!rm model.tar.gz
!tar czvf model.tar.gz code_llama2_7b_fp16

rm: cannot remove ‘model.tar.gz’: No such file or directory
code_llama2_7b_fp16/
code_llama2_7b_fp16/.ipynb_checkpoints/
code_llama2_7b_fp16/.ipynb_checkpoints/serving-checkpoint.properties
code_llama2_7b_fp16/serving.properties


In [14]:
s3_code_artifact = sess.upload_data("model.tar.gz", bucket, s3_code_prefix)

#### 1.2 Deploy endpoint for Dynamic Batching

In [15]:
from sagemaker.utils import name_from_base

model_name = name_from_base(f"Llama-2-7b-chat")
print(model_name)

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={"Image": inference_image_uri, "ModelDataUrl": s3_code_artifact},
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

Llama-2-7b-chat-2023-10-03-17-35-02-971
Created Model: arn:aws:sagemaker:us-west-2:757967535041:model/llama-2-7b-chat-2023-10-03-17-35-02-971


In [16]:
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.12xlarge",
            "InitialInstanceCount": 1,
            "ModelDataDownloadTimeoutInSeconds": 900,
            "ContainerStartupHealthCheckTimeoutInSeconds": 900,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:757967535041:endpoint-config/llama-2-7b-chat-2023-10-03-17-35-02-971-config',
 'ResponseMetadata': {'RequestId': 'f5d75040-c9bd-4680-abe2-0ac4d1cc0bc2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f5d75040-c9bd-4680-abe2-0ac4d1cc0bc2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '127',
   'date': 'Tue, 03 Oct 2023 17:35:08 GMT'},
  'RetryAttempts': 0}}

In [17]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-west-2:757967535041:endpoint/llama-2-7b-chat-2023-10-03-17-35-02-971-endpoint


#### Wait for endpoint to be In-service. This can take a while, so please be patient

In [18]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:757967535041:endpoint/llama-2-7b-chat-2023-10-03-17-35-02-971-endpoint
Status: InService


In [24]:
# endpoint_name = "Llama-2-7b-chat-2023-10-02-01-09-05-406-endpoint"

In [19]:
class LineIterator:
    """
    A helper class for parsing the byte stream input. 
    
    The output of the model will be in the following format:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```
    
    While usually each PayloadPart event from the event stream will contain a byte array 
    with a full json, this is not guaranteed and some of the json objects may be split across
    PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```
    
    This class accounts for this by concatenating bytes written via the 'write' function
    and then exposing a method which will return lines (ending with a '\n' character) within
    the buffer via the 'scan_lines' function. It maintains the position of the last read 
    position to ensure that previous bytes are not exposed again. 
    """
    
    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord('\n'):
                self.read_pos += len(line)
                return line[:-1]
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if self.read_pos < self.buffer.getbuffer().nbytes:
                    continue
                raise
            if 'PayloadPart' not in chunk:
                print('Unknown event type:' + chunk)
                continue
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk['PayloadPart']['Bytes'])

In [20]:
payload = {
    "inputs":  
      "Human: Write me a long poem about Paris.\nAssistant: ",
   "parameters":{"max_new_tokens":256, "top_p":0.9, "temperature":0.6, "stop_squence": "Human"}
}

In [1]:
import boto3
import json
import io

sm_client = boto3.client("sagemaker-runtime")
resp = sm_client.invoke_endpoint_with_response_stream(EndpointName=endpoint_name,Body=json.dumps(payload).encode("utf-8"))
 
event_stream = resp['Body']

for line in LineIterator(event_stream):
    resp = json.loads(line)
    print(resp.get("outputs")[0], end=' ')

NameError: name 'endpoint_name' is not defined