# Prepare dependency packages

In [20]:
!pip3 install jsonpath_ng boto3 --upgrade --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.29.42 requires botocore==1.31.42, but you have botocore 1.31.63 which is incompatible.
awscli 1.29.42 requires s3transfer<0.7.0,>=0.6.0, but you have s3transfer 0.7.0 which is incompatible.[0m[31m
[0m

# Prompt examples

In [2]:
# Sampling settings shared by every example request below.
_SAMPLING_DEFAULTS = {
    "max_new_tokens": 512,
    "repetition_penalty": 1.0,
    "return_full_text": False,
    "temperature": 0.7,
    "top_p": 0.3,
    "top_k": 1,
}

# Three sample payloads, in order: a chat-style prompt, the same prompt with
# "stream" enabled, and a bare prompt with explicit stop sequences.
# NOTE(review): "\\n" in the first two prompts is a literal backslash followed by
# "n", not a newline character -- confirm that is what the handlers expect.
params_examples = [
    {"prompt": "Human: Hello there!\\nAssistant:", **_SAMPLING_DEFAULTS},
    {"prompt": "Human: Hello there!\\nAssistant:", **_SAMPLING_DEFAULTS, "stream": True},
    {"prompt": "Hello there!", **_SAMPLING_DEFAULTS, "stop": ["\nUser", "endoftext"]},
]

# Bedrock

## Available models

In [3]:
import boto3

# Bedrock client for the us-west-2 region.
br = boto3.client("bedrock", region_name="us-west-2")

# Print the identifier of every foundation model in the listing, one per line.
model_summaries = br.list_foundation_models()["modelSummaries"]
for summary in model_summaries:
    print(summary["modelId"])

amazon.titan-tg1-large
amazon.titan-e1t-medium
amazon.titan-embed-g1-text-02
amazon.titan-text-express-v1
amazon.titan-embed-text-v1
stability.stable-diffusion-xl
stability.stable-diffusion-xl-v0
ai21.j2-grande-instruct
ai21.j2-jumbo-instruct
ai21.j2-mid
ai21.j2-mid-v1
ai21.j2-ultra
ai21.j2-ultra-v1
anthropic.claude-instant-v1
anthropic.claude-v1
anthropic.claude-v2
cohere.command-text-v14


## Invoke without streaming

In [4]:
from importlib import import_module

model_family = "bedrock"
model_names = [
    "amazon.titan-tg1-large",
    "ai21.j2-grande-instruct",
    "ai21.j2-jumbo-instruct",
    "ai21.j2-mid",
    "ai21.j2-mid-v1",
    "ai21.j2-ultra",
    "ai21.j2-ultra-v1",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14",
]

# Call each model once with the first (non-streaming) example payload and print its reply.
for model_name in model_names:
    handler_module = import_module("handlers." + model_family)
    invoke = handler_module.model(model_name).invoke
    reply = invoke(params_examples[0])
    print(f"model: {model_name} - TEXT: {reply['generated_text']}")

model: amazon.titan-tg1-large - TEXT:  Hello, how can I help you today?
model: ai21.j2-grande-instruct - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-jumbo-instruct - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-mid - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-mid-v1 - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-ultra - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-ultra-v1 - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: anthropic.claude-instant-v1 - TEXT:  Hello!
model: anthropic.claude-v1 - TEXT:  Hello! My name is Claude.
model: anthropic.claude-v2 - TEXT:  Hello! Nice to meet you.
model: cohere

## Invoke with streaming

In [6]:
from importlib import import_module

model_family = "bedrock"
# Only the Anthropic and Cohere models are exercised here; the Titan and AI21
# entries were commented out of this list (presumably no streaming support -- TODO confirm).
model_names = [
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14",
]

for model_name in model_names:
    handler_module = import_module("handlers." + model_family)
    invoke = handler_module.model(model_name).invoke_with_response_stream
    print()
    print(f"model: {model_name} - TEXT: ", end="")
    for chunk in invoke(params_examples[1]):
        # Skip events that carry no text as well as the end-of-stream sentinel.
        if "generated_text" not in chunk or chunk["generated_text"] == "<EOS_TOKEN>":
            continue
        print(chunk["generated_text"], end="")


model: anthropic.claude-instant-v1 - TEXT:  Hello!
model: anthropic.claude-v1 - TEXT:  Hello! My name is Claude.
model: anthropic.claude-v2 - TEXT:  Hello! Nice to meet you.
model: cohere.command-text-v14 - TEXT:  Hello! How can I assist you today?

# SageMaker

## Deploy TGI endpoint

In [None]:
from sagemaker.model import Model
from sagemaker import get_execution_role
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel
# Resolve the Hugging Face LLM container image (version 0.8.2).
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

role = get_execution_role()

# Deployment settings.
hf_model_id = "tiiuae/falcon-7b-instruct"  # model id from huggingface.co/models
model_name = hf_model_id.replace("/", "-").replace(".", "-")  # "/" and "." replaced by "-"
endpoint_name = "test-sagemaker-02"
instance_type = "ml.g5.2xlarge"  # instance type to use for deployment
number_of_gpus = 1  # number of gpus to use for inference and tensor parallelism
# 900 seconds (15 minutes) of health-check grace so the model download can finish.
health_check_timeout = 900

# Environment handed to the serving container.
container_env = {
    "HF_MODEL_ID": hf_model_id,
    # "HF_MODEL_QUANTIZE": "bitsandbytes",  # enable to quantize
    "SM_NUM_GPUS": str(number_of_gpus),
    "MAX_INPUT_LENGTH": "3000",  # Max length of input text
    "MAX_TOTAL_TOKENS": "6000",  # Max length of the generation (including input text)
    "HF_MODEL_REVISION": "eb410fb6ffa9028e97adb801f0d6ec46d02f8b07",
}

llm_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
    env=container_env,
    name=model_name,
)

# Create the endpoint on a single instance.
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    endpoint_name=endpoint_name,
)

## Invoke with no stream

In [7]:
from importlib import import_module

model_family = "sagemaker"
model_name = "tgi.test-sagemaker-02"

# One non-streaming call through the sagemaker handler, using the first example payload.
handler_module = import_module("handlers." + model_family)
invoke = handler_module.model(model_name).invoke
reply = invoke(params_examples[0])
print(f"model: {model_name} - TEXT: {reply['generated_text']}")

model: tgi.test-sagemaker-02 - TEXT:  Hi there! How can I help you today?
User 


## Invoke with stream

In [3]:
from importlib import import_module

model_family = "sagemaker"
model_name = "tgi.test-sagemaker-02"

# Stream the reply for the second example payload and print the text as it arrives.
handler_module = import_module("handlers." + model_family)
invoke = handler_module.model(model_name).invoke_with_response_stream
print()
print(f"model: {model_name} - TEXT: ", end="")
for chunk in invoke(params_examples[1]):
    # Skip events that carry no text as well as the end-of-stream sentinel.
    if "generated_text" not in chunk or chunk["generated_text"] == "<EOS_TOKEN>":
        continue
    print(chunk["generated_text"], end="")


model: tgi.test-sagemaker-02 - TEXT:  Hi Hi there there!! How How can can I I help help you you today today??

UserUser  <|endoftext|><|endoftext|>

In [2]:
# Build the image tagged api-layer:latest from the current directory; with --quiet only the image ID is printed.
!docker build --quiet --tag api-layer:latest .

sha256:e2a26379348c5f0dfee0dbefeb5b9ecdd2f62b776ee365df919857b9ddc8f0c4


In [23]:
# Start a detached container named test-api-layer from api-layer:latest, publish port 8001 and set both
# AWS region variables to ap-southeast-2; the trailing --host/--port arguments are passed to the container's command.
!docker run --name test-api-layer -d -p 8001:8001 -e AWS_REGION=ap-southeast-2 -e AWS_DEFAULT_REGION=ap-southeast-2 api-layer:latest --host 0.0.0.0 --port 8001

5cf20457022ca5e25adc56975953516ff9bc3c701b269053087b073d16362d9b


In [24]:
# Print the environment inside the running container (e.g. to confirm the AWS region variables were applied).
!docker exec test-api-layer printenv

PATH=/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
HOSTNAME=5cf20457022c
AWS_REGION=ap-southeast-2
AWS_DEFAULT_REGION=ap-southeast-2
LANG=C.UTF-8
GPG_KEY=A035C8C19219BA821ECEA86B64E628F8D684696D
PYTHON_VERSION=3.10.13
PYTHON_PIP_VERSION=23.0.1
PYTHON_SETUPTOOLS_VERSION=65.5.1
PYTHON_GET_PIP_URL=https://github.com/pypa/get-pip/raw/9af82b715db434abb94a0a6f3569f43e72157346/public/get-pip.py
PYTHON_GET_PIP_SHA256=45a2bb8bf2bb5eff16fdd00faef6f29731831c7c59bd9fc2bf1f3bed511ff1fe
HOME=/root


In [22]:
# Kill the test container and, only if the kill succeeds, remove it.
!docker kill test-api-layer && docker rm test-api-layer

test-api-layer
test-api-layer


In [67]:
# Remove the test-api-layer container (without --force this only works on a container that is not running).
!docker rm test-api-layer

test-api-layer


In [25]:
# Show the container's log output, e.g. to check that the server inside it has started.
!docker logs test-api-layer

INFO:     Started server process [1]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)


In [3]:
# List the configured Docker contexts; the entry marked with * is the active one.
!docker context ls

NAME                              DESCRIPTION                               DOCKER ENDPOINT                                               ERROR
default                           Current DOCKER_HOST based configuration   unix:///var/run/docker.sock                                   
m5.xlarge_i-03dfd58f044bc8079 *                                             tcp://ip-172-31-67-109.ap-southeast-2.compute.internal:1111   


In [4]:
# Host that the API-layer requests below are sent to (port 8001). Despite the name this is a
# DNS host name, not an IP address; it matches the host of the active Docker context's endpoint.
host_ip = "ip-172-31-67-109.ap-southeast-2.compute.internal"

In [5]:
%%bash -s $host_ip
# ${1} is the host passed in through `-s $host_ip`. Sends one non-streaming request to the
# /invoke route on port 8001, selecting model anthropic.claude-v2 of the bedrock family.
curl -X POST ${1}:8001/invoke -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"}, "model_family": "bedrock", "model_name": "anthropic.claude-v2"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   868  100   714  100   154     53     11  0:00:14  0:00:13  0:00:01   177


{"generated_text":" Here is a poem about a young girl named Zee:\n\nZee, the girl with eyes so bright\nFull of spirit, full of light\nHair flowing free in the breeze\nLaughing, playing with ease\nAdventurous, fun and free\nFilled with joy for all to see\nOn the swings, she loves to fly\nReaching up to touch the sky  \nWith imagination unbound\nNew worlds and stories can be found\nIn her mind, where fairies dwell\nAnd magic weaves a happy spell\nZee lives each day with wonder and glee\nThe world's a playground, just wait and see\nA smile, a laugh, she spreads them around\nTo all she meets, joy does abound\nHer spirit shines for all to see\nZee, a girl full of life's beauty","finish_reason":"stop_sequence"}

In [10]:
%%bash -s $host_ip
# ${1} is the host passed in through `-s $host_ip`. Same request as the /invoke call, but sent to
# /invoke_stream, which answers with one JSON object per line as the text is generated.
curl -X POST ${1}:8001/invoke_stream -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"}, "model_family": "bedrock", "model_name": "anthropic.claude-v2"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2008    0  1854  100   154     76      6  0:00:25  0:00:24  0:00:01   260


{"generated_text": " Here", "finish_reason": null}
{"generated_text": " is a poem about a", "finish_reason": null}
{"generated_text": " young girl named Zee", "finish_reason": null}
{"generated_text": ":\n\nZee,", "finish_reason": null}
{"generated_text": " the", "finish_reason": null}
{"generated_text": " girl with", "finish_reason": null}
{"generated_text": " eyes so", "finish_reason": null}
{"generated_text": " bright", "finish_reason": null}
{"generated_text": "\nFull", "finish_reason": null}
{"generated_text": " of spirit, full of", "finish_reason": null}
{"generated_text": " light\nHair flowing free in", "finish_reason": null}
{"generated_text": " the breeze\nLaughing", "finish_reason": null}
{"generated_text": ", playing with ease", "finish_reason": null}
{"generated_text": "\nAdventurous, fun", "finish_reason": null}
{"generated_text": " and free\nFilled", "finish_reason": null}
{"generated_text": " with joy for all to see", "finish_reason": null}
{"generated_text": "\nOn the s

In [11]:
# System prompt describing a Python code-interpreter tool to the model: how to request
# execution (a single fenced python block per message, commentary above it) and what the
# execution environment offers.
CODE_INTERPRETER_SYSTEM_PROMPT = """You are a helpful AI assistant.

You have access to a python code interpreter, which supports you in your tasks.
The code is executed in an interactive shell, imports and variables are preserved between calls.
The environment has internet and file system access.
The current working directory is shared with the user, so files can be exchanged.
There are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.
You cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.
If the code runs too long, there will be a timeout.

To access the interpreter, use the following format:
```python
<your code>
```
If you want to call Python and still say something, do only output text above the code block, NOT below.
Only provide at most one code block per message.
The code will be executed automatically and the result will be sent back to you
"""

# Speaker labels used when assembling prompts: index 0 opens the prompt, index 1 ends it
# so that the model continues as the assistant.
ROLES = ["Human", "Assistant"]

In [12]:
import os

# Switch into the app directory (presumably where the `handlers` package lives -- TODO confirm).
# Guarded so that re-running this cell does not try to descend into app/app.
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app")

In [13]:
from importlib import import_module

model_family = "bedrock"
model_name = "anthropic.claude-v2"

text = "what is 52th fibonacci number?"

# Prompt layout: "<human label>: <system prompt> QUESTION: <text>\n<assistant label>:".
# The separator before the assistant label must be the newline escape "\n";
# writing "/n" would put the two literal characters slash and n into the prompt.
prompt = (
    ROLES[0] + ": " + CODE_INTERPRETER_SYSTEM_PROMPT
    + " QUESTION: " + text + "\n"
    + ROLES[1] + ":"
)

# Stream the model's answer and print each text fragment as it arrives.
invoke = import_module("handlers." + model_family).model(model_name).invoke_with_response_stream
print()
print(f"model: {model_name} - TEXT: ", end="")
for i in invoke({"prompt": prompt}):
    # Ignore events without text and the end-of-stream sentinel.
    if "generated_text" in i and i["generated_text"] != "<EOS_TOKEN>":
        print(i["generated_text"], end="")

ModuleNotFoundError: No module named 'jsonpath_ng'

In [14]:
import json

# Serialise the request body with json.dumps so that quotes, backslashes and newlines
# inside the prompt are escaped correctly; JSON assembled by string concatenation
# becomes invalid as soon as the prompt contains a double quote or a backslash.
data = json.dumps(
    {
        "body": {"prompt": prompt},
        "model_family": model_family,
        "model_name": model_name,
    }
)

In [15]:
import json

# Round-trip check: confirm that `data` parses as valid JSON and display the resulting dict.
json.loads(data)

{'body': {'prompt': 'Human: You are a helpful AI assistant.\n\nYou have access to a python code interpreter, which supports you in your tasks.\nThe code is executed in an interactive shell, imports and variables are preserved between calls.\nThe environment has internet and file system access.\nThe current working directory is shared with the user, so files can be exchanged.\nThere are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.\nYou cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.\nIf the code runs too long, there will be a timeout.\n\nTo access the interpreter, use the following format:\n```python\n<your code>\n```\nIf you want to call Python and still say something, do only output text above the code block, NOT below.\nOnly provide at most one code block per message.\nThe code will be executed automatically and the result will be sent back to you\n QUESTION: what is 52t

In [18]:
import requests

host_url = f"http://{host_ip}:8001/invoke_stream"

def iter_func(result):
    """Yield the ``generated_text`` of every JSON line in a streamed response.

    Args:
        result: A streamed ``requests`` response whose body is one JSON object per line.

    Yields:
        The ``generated_text`` value of each non-empty line.
    """
    for chunk in result.iter_lines():
        # iter_lines() can produce empty keep-alive lines, which are not valid JSON.
        if not chunk:
            continue
        yield json.loads(chunk)["generated_text"]

# The context manager releases the connection once the stream has been consumed
# (or if printing is interrupted part-way through).
with requests.post(url=host_url, data=data, stream=True) as res:
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError on an error body.
    res.raise_for_status()
    for chunk in iter_func(res):
        print(chunk, end="")

 Here is the code to get the 52nd Fibonacci number:

```python
a, b = 0, 1
for i in range(50):
    a, b = b, a + b
print(a)
```

The 52nd Fibonacci number is: 806515533049393

In [43]:
# Base URL of the API layer's non-streaming route; "_stream" is appended to reach the streaming route.
api_layer_url = f"http://{host_ip}:8001/invoke"

In [57]:
stream = True
# Suffix that turns the base route into the streaming route when streaming is requested.
endpoint_suffix = "_stream" if stream else ""
print(host_url)
print(api_layer_url + endpoint_suffix)

http://ip-172-31-75-227.ap-southeast-2.compute.internal:8001/invoke_stream
http://ip-172-31-75-227.ap-southeast-2.compute.internal:8001/invoke_stream


In [108]:
def send_req_to_agent(text, model_family, model_name, stream=False):
    """Send a prompt to the API layer and return the generated text.

    The request is posted to the module-level ``api_layer_url``; ``"_stream"`` is
    appended to that URL when ``stream`` is true.

    Args:
        text: Prompt placed under ``body.prompt`` in the request.
        model_family: Value of the request's ``model_family`` field.
        model_name: Value of the request's ``model_name`` field.
        stream: When true, use the streaming route and return a generator.

    Returns:
        The ``generated_text`` string of the response when ``stream`` is false;
        otherwise a generator that yields each chunk's ``generated_text`` and
        finally the marker ``"|STOP|"``.

    Raises:
        requests.HTTPError: If the API layer answers with an error status.
    """
    def iter_func(res):
        # One JSON object per line; "|STOP|" tells the consumer the stream is over.
        try:
            for line in res.iter_lines():
                # iter_lines() can produce empty keep-alive lines, which are not valid JSON.
                if not line:
                    continue
                yield json.loads(line)["generated_text"]
            yield "|STOP|"
        finally:
            # Release the connection even if the consumer stops iterating early.
            res.close()

    payload = {
        "body": {
            "prompt": text
        },
        "model_family": model_family,
        "model_name": model_name
    }
    ret = requests.post(
        url=api_layer_url + ("_stream" if stream else ""),
        data=json.dumps(payload),
        stream=stream
    )
    # Surface HTTP errors directly instead of failing later on a missing "generated_text" key.
    ret.raise_for_status()
    if stream:
        return iter_func(ret)
    return json.loads(ret.text)["generated_text"]

In [106]:
# Non-streaming call: the cell displays the complete generated text as a single string.
send_req_to_agent(prompt, model_family, model_name, stream=False)

' Here is the code to get the 52nd Fibonacci number:\n\n```python\na, b = 0, 1\nfor i in range(50):\n    a, b = b, a + b\nprint(a)\n```\n\nThe 52nd Fibonacci number is: 806515533049393'

In [109]:
# Print streamed fragments as they arrive; the "|STOP|" marker signals the end of the stream.
for text in send_req_to_agent(prompt, model_family, model_name, stream=True):
    if text == "|STOP|":
        print("\n\n Stream ended")
    else:
        print(text, end="")

 Here is the code to get the 52nd Fibonacci number:

```python
a, b = 0, 1
for i in range(50):
    a, b = b, a + b
print(a)
```

The 52nd Fibonacci number is: 806515533049393

 Stream ended


In [5]:
!pip install sagemaker -U -q

[0m

In [32]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Use the notebook's execution role when one is available; otherwise look the
# role up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_response = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_response["Role"]["Arn"]

# Hub Model configuration. https://huggingface.co/models
hub = {
    "HF_MODEL_ID": "codellama/CodeLlama-13b-instruct-hf",
    "SM_NUM_GPUS": "4",
}

# Hugging Face LLM container image, version 1.1.0.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="1.1.0")
huggingface_model = HuggingFaceModel(image_uri=llm_image_uri, env=hub, role=role)

# Create the named endpoint on a single ml.g5.12xlarge instance.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    container_startup_health_check_timeout=300,
    endpoint_name="code-llama-13b-instruct-endpoint",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
------------!

In [35]:
%%bash -s $host_ip
# ${1} is the host passed in through `-s $host_ip`. Sends one non-streaming request to /invoke for the
# sagemaker family, model tgi.code-llama-13b-instruct-endpoint.
# NOTE(review): the request sets no max_new_tokens and the recorded reply stops after 20 generated
# tokens with finish_reason "length" -- presumably a default limit; confirm and set it explicitly if needed.
curl -X POST ${1}:8001/invoke -d '{"body":{"prompt": "Human: Write a script to list all my Amazon SageMaker models\nAssistant:"}, "model_family": "sagemaker", "model_name": "tgi.code-llama-13b-instruct-endpoint"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   323  100   145  100   178    263    323 --:--:-- --:--:-- --:--:--   586


{"finish_reason":"length","generated_tokens":20,"seed":null,"generated_text":" Here is a list of all your Amazon SageMaker models:\n\n1. Model1"}