# Prepare dependency packages

In [2]:
# Install into the environment of the running kernel: %pip targets the kernel's
# interpreter, whereas `!pip3` runs whichever pip3 happens to be first on PATH.
# NOTE(review): upgrading boto3 also upgrades botocore/s3transfer, which can
# conflict with a pinned awscli in the same environment.
%pip install --upgrade --quiet jsonpath_ng boto3

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.29.42 requires botocore==1.31.42, but you have botocore 1.31.62 which is incompatible.
awscli 1.29.42 requires s3transfer<0.7.0,>=0.6.0, but you have s3transfer 0.7.0 which is incompatible.[0m[31m
[0m

# Prompt examples

In [2]:
# Generation settings shared by every example request below.
_COMMON_PARAMS = {
    "max_new_tokens": 512,
    "repetition_penalty": 1.0,
    "return_full_text": False,
    "temperature": 0.7,
    "top_p": 0.3,
    "top_k": 1,
}

# Example request payloads passed to the handlers' invoke functions.
# The turn separator in the prompts is a real newline ("\n"); the previous
# "\\n" put a literal backslash followed by "n" into the prompt text.
params_examples = [
    # [0] Human/Assistant turn, no streaming flag.
    {"prompt": "Human: Hello there!\nAssistant:", **_COMMON_PARAMS},
    # [1] Same turn with the "stream" flag set.
    {"prompt": "Human: Hello there!\nAssistant:", **_COMMON_PARAMS, "stream": True},
    # [2] Bare prompt with explicit stop sequences.
    {"prompt": "Hello there!", **_COMMON_PARAMS, "stop": ["\nUser", "endoftext"]},
]

# Bedrock

## Available models

In [3]:
import boto3

# Client for the "bedrock" service in us-west-2, used here to list models.
br = boto3.client("bedrock", region_name="us-west-2")

# Print the ID of every foundation model summary returned by the service.
model_summaries = br.list_foundation_models()["modelSummaries"]
for summary in model_summaries:
    print(summary["modelId"])

amazon.titan-tg1-large
amazon.titan-e1t-medium
amazon.titan-embed-g1-text-02
amazon.titan-text-express-v1
amazon.titan-embed-text-v1
stability.stable-diffusion-xl
stability.stable-diffusion-xl-v0
ai21.j2-grande-instruct
ai21.j2-jumbo-instruct
ai21.j2-mid
ai21.j2-mid-v1
ai21.j2-ultra
ai21.j2-ultra-v1
anthropic.claude-instant-v1
anthropic.claude-v1
anthropic.claude-v2
cohere.command-text-v14


## Invoke without streaming

In [4]:
from importlib import import_module

# Send the first example request (no streaming) to each listed Bedrock model
# and print the generated text next to the model ID.
model_family = "bedrock"
model_names = [
    "amazon.titan-tg1-large",
    "ai21.j2-grande-instruct",
    "ai21.j2-jumbo-instruct",
    "ai21.j2-mid",
    "ai21.j2-mid-v1",
    "ai21.j2-ultra",
    "ai21.j2-ultra-v1",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14",
]

# The handler module is the same for every model, so resolve it once.
handler_module = import_module("handlers." + model_family)
for model_name in model_names:
    invoke = handler_module.model(model_name).invoke
    generated = invoke(params_examples[0])["generated_text"]
    print(f"model: {model_name} - TEXT: {generated}")

model: amazon.titan-tg1-large - TEXT:  Hello, how can I help you today?
model: ai21.j2-grande-instruct - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-jumbo-instruct - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-mid - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-mid-v1 - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-ultra - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-ultra-v1 - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: anthropic.claude-instant-v1 - TEXT:  Hello!
model: anthropic.claude-v1 - TEXT:  Hello! My name is Claude.
model: anthropic.claude-v2 - TEXT:  Hello! Nice to meet you.
model: cohere

## Invoke with streaming

In [6]:
from importlib import import_module

# Stream the second example request (the one with "stream": True) through each
# enabled Bedrock model, printing text chunks as they arrive.
model_family = "bedrock"
model_names = [
    # Titan and AI21 entries are disabled for the streaming run
    # (presumably no streaming support in the handler - TODO confirm).
    # "amazon.titan-tg1-large",
    # "ai21.j2-grande-instruct",
    # "ai21.j2-jumbo-instruct",
    # "ai21.j2-mid",
    # "ai21.j2-mid-v1",
    # "ai21.j2-ultra",
    # "ai21.j2-ultra-v1",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14",
]

# The handler module is the same for every model, so resolve it once.
handler_module = import_module("handlers." + model_family)
for model_name in model_names:
    invoke = handler_module.model(model_name).invoke_with_response_stream
    # Blank line between models, then the header with no trailing newline so
    # the streamed chunks continue on the same line.
    print(f"\nmodel: {model_name} - TEXT: ", end="")
    for chunk in invoke(params_examples[1]):
        # Skip chunks without text and the end-of-sequence marker.
        if "generated_text" not in chunk or chunk["generated_text"] == "<EOS_TOKEN>":
            continue
        print(chunk["generated_text"], end="")


model: anthropic.claude-instant-v1 - TEXT:  Hello!
model: anthropic.claude-v1 - TEXT:  Hello! My name is Claude.
model: anthropic.claude-v2 - TEXT:  Hello! Nice to meet you.
model: cohere.command-text-v14 - TEXT:  Hello! How can I assist you today?

# SageMaker

## Deploy TGI endpoint

In [None]:
from sagemaker.model import Model
from sagemaker import get_execution_role
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel
# Look up the container image URI for the Hugging Face LLM backend at the pinned version.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="0.8.2"
)

# Deployment settings.
role = get_execution_role()
hf_model_id = "tiiuae/falcon-7b-instruct" # model id from huggingface.co/models
# Derive the SageMaker model name from the model id by replacing "/" and "." with "-".
model_name = hf_model_id.replace("/","-").replace(".","-")
endpoint_name = "test-sagemaker-02"
instance_type = "ml.g5.2xlarge" # instance type to use for deployment
number_of_gpus = 1 # number of gpus to use for inference and tensor parallelism
health_check_timeout = 900 # Startup health-check timeout: 900 s = 15 minutes, to leave time for downloading the model

# Describe the model: execution role, container image and the container's environment variables.
llm_model = HuggingFaceModel(
      role=role,
      image_uri=llm_image,
      env={
        'HF_MODEL_ID': hf_model_id,
        # 'HF_MODEL_QUANTIZE': "bitsandbytes", # uncomment to quantize
        'SM_NUM_GPUS': f"{number_of_gpus}",
        'MAX_INPUT_LENGTH': "3000",  # Max length of input text
        'MAX_TOTAL_TOKENS': "6000",  # Max length of the generation (including input text)
        'HF_MODEL_REVISION': 'eb410fb6ffa9028e97adb801f0d6ec46d02f8b07'  # presumably pins the model repo revision - TODO confirm
      },
      name=model_name
    )

# Create the endpoint with a single instance; `llm` holds the value returned by deploy().
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  endpoint_name=endpoint_name,
)

## Invoke without streaming

In [7]:
from importlib import import_module

# Send the first example request (no streaming) through the SageMaker handler
# and print the generated text.
model_family = "sagemaker"
# NOTE(review): looks like "<container type>.<endpoint name>" - confirm against the handler.
model_name = "tgi.test-sagemaker-02"

handler = import_module("handlers." + model_family).model(model_name)
invoke = handler.invoke
result = invoke(params_examples[0])
print(f"model: {model_name} - TEXT: {result['generated_text']}")

model: tgi.test-sagemaker-02 - TEXT:  Hi there! How can I help you today?
User 


## Invoke with streaming

In [3]:
from importlib import import_module

# Stream the second example request (the one with "stream": True) through the
# SageMaker handler, printing text chunks as they arrive.
model_family = "sagemaker"
# NOTE(review): looks like "<container type>.<endpoint name>" - confirm against the handler.
model_name = "tgi.test-sagemaker-02"

handler = import_module("handlers." + model_family).model(model_name)
invoke = handler.invoke_with_response_stream
# Blank line, then the header with no trailing newline so the streamed chunks
# continue on the same line.
print(f"\nmodel: {model_name} - TEXT: ", end="")
for chunk in invoke(params_examples[1]):
    # Skip chunks without text and the end-of-sequence marker.
    if "generated_text" not in chunk or chunk["generated_text"] == "<EOS_TOKEN>":
        continue
    print(chunk["generated_text"], end="")


model: tgi.test-sagemaker-02 - TEXT:  Hi Hi there there!! How How can can I I help help you you today today??

UserUser  <|endoftext|><|endoftext|>

In [2]:
# Build the image from the Dockerfile in the current directory and tag it api-layer:latest.
!docker build --tag api-layer:latest .

[1A[1B[0G[?25l[+] Building 0.0s (0/0)                     docker:m5.large_i-02880917500cf72a3
[?25h[1A[0G[?25l[+] Building 0.0s (0/0)                     docker:m5.large_i-02880917500cf72a3
[?25h[1A[0G[?25l[+] Building 0.0s (0/0)                     docker:m5.large_i-02880917500cf72a3
[?25h[1A[0G[?25l[+] Building 0.1s (2/3)                     docker:m5.large_i-02880917500cf72a3
[34m => [internal] load .dockerignore                                          0.1s
[0m[34m => => transferring context: 2B                                            0.0s
[0m[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 310B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.10-slim        0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                     docker:m5.large_i-02880917500cf72a3
[34m => [internal] load .dockerignore        

In [3]:
# Start the api-layer image detached as container "test-api-layer", publishing port 8001;
# the trailing --host/--port arguments are passed through to the container's command.
!docker run --name test-api-layer -d -p 8001:8001 api-layer:latest --host 0.0.0.0 --port 8001

8b482c92524e9c507a406a64e47a499adbba782cf90dd1407943e2cd227cc7ee


In [80]:
# Kill the running test-api-layer container and, if that succeeds, remove it.
!docker kill test-api-layer && docker rm test-api-layer

test-api-layer
test-api-layer


In [67]:
# Remove the test-api-layer container (without killing it first).
!docker rm test-api-layer

test-api-layer


In [7]:
# Print the logs of the test-api-layer container.
!docker logs test-api-layer

INFO:     Started server process [1]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO:     172.31.67.48:58412 - "POST /invoke HTTP/1.1" 500 Internal Server Error
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 408, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 84, in __call__
    return await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 292, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 122, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10

In [15]:
# List the configured Docker contexts; the entry marked with * is the active one.
!docker context ls

NAME                             DESCRIPTION                               DOCKER ENDPOINT                                               ERROR
default                          Current DOCKER_HOST based configuration   unix:///var/run/docker.sock                                   
m5.large_i-061b682239d3aedb5 *                                             tcp://ip-172-31-65-185.ap-southeast-2.compute.internal:1111   


In [87]:
# POST a generation request to the API layer's /invoke route, selecting the
# bedrock handler and the anthropic.claude-v2 model; the reply is a single JSON object.
# NOTE(review): the hostname is specific to the environment this notebook was run in.
!curl -X POST ip-172-31-65-185.ap-southeast-2.compute.internal:8001/invoke -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"}, "model_family": "bedrock", "model_name": "anthropic.claude-v2"}'

{"generated_text":" Here is a poem about a young girl named Zee:\n\nZee, the girl with eyes so bright\nFull of spirit, full of light\nHair flowing free in the breeze\nLaughing, playing with ease\nAdventurous, fun and free\nFilled with joy for all to see\nOn the swings, she loves to fly\nReaching up to touch the sky  \nWith imagination unbound\nNew worlds and stories can be found\nIn her mind, where fairies dwell\nAnd magic weaves a happy spell\nZee lives each day with wonder and glee\nThe world's a playground, just wait and see\nA smile, a laugh, she spreads them around\nTo all she meets, joy does abound\nHer spirit shines for all to see\nZee, a girl full of life's beauty","finish_reason":"stop_sequence"}

In [88]:
# POST the same request to the /invoke_stream route; the reply arrives as a
# sequence of JSON objects, one per generated chunk.
# NOTE(review): the hostname is specific to the environment this notebook was run in.
!curl -X POST ip-172-31-65-185.ap-southeast-2.compute.internal:8001/invoke_stream -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"}, "model_family": "bedrock", "model_name": "anthropic.claude-v2"}'

{"generated_text": " Here", "finish_reason": null}
{"generated_text": " is a poem about a young girl named Zee:\n\nZee, the", "finish_reason": null}
{"generated_text": " girl with eyes so bright", "finish_reason": null}
{"generated_text": "\nFull of spirit,", "finish_reason": null}
{"generated_text": " full of light\nHair flowing free in", "finish_reason": null}
{"generated_text": " the breeze\nLaughing, playing", "finish_reason": null}
{"generated_text": " with ease\nAdvent", "finish_reason": null}
{"generated_text": "urous, fun and free\nFilled", "finish_reason": null}
{"generated_text": " with joy for all to see\nOn the", "finish_reason": null}
{"generated_text": " swings, she loves to", "finish_reason": null}
{"generated_text": " fly\nReaching up to touch", "finish_reason": null}
{"generated_text": " the sky  \nWith imagination un", "finish_reason": null}
{"generated_text": "bound\nNew worlds and stories can", "finish_reason": null}
{"generated_text": " be found\nIn her mind, where

In [19]:
# Instruction text that is prepended to the user's question when building the
# prompt. It tells the model that a Python interpreter is available and that
# code must be returned in a single fenced ```python block per message.
CODE_INTERPRETER_SYSTEM_PROMPT = """You are a helpful AI assistant.

You have access to a python code interpreter, which supports you in your tasks.
The code is executed in an interactive shell, imports and variables are preserved between calls.
The environment has internet and file system access.
The current working directory is shared with the user, so files can be exchanged.
There are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.
You cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.
If the code runs too long, there will be a timeout.

To access the interpreter, use the following format:
```python
<your code>
```
If you want to call Python and still say something, do only output text above the code block, NOT below.
Only provide at most one code block per message.
The code will be executed automatically and the result will be sent back to you
"""

# Speaker labels used when assembling a prompt: index 0 is the user turn,
# index 1 is the model turn.
ROLES = ["Human", "Assistant"]

In [20]:
# Imported locally so this cell does not depend on an earlier cell's import.
from importlib import import_module

# Ask Claude v2 (via the bedrock handler) a question with the code-interpreter
# instructions prepended, and stream the answer to the cell output.
model_family = "bedrock"
model_name = "anthropic.claude-v2"

text = "what is 52th fibonacci number?"

# One Human turn (instructions + question) followed by the Assistant cue.
# The turn separator is a real newline; the previous "/n" put the two literal
# characters "/" and "n" into the prompt.
prompt = f"{ROLES[0]}: {CODE_INTERPRETER_SYSTEM_PROMPT} QUESTION: {text}\n{ROLES[1]}:"

invoke = import_module("handlers." + model_family).model(model_name).invoke_with_response_stream
# Blank line, then the header with no trailing newline so the streamed chunks
# continue on the same line.
print(f"\nmodel: {model_name} - TEXT: ", end="")
for chunk in invoke({"prompt": prompt}):
    # Skip chunks without text and the end-of-sequence marker.
    if "generated_text" not in chunk or chunk["generated_text"] == "<EOS_TOKEN>":
        continue
    print(chunk["generated_text"], end="")


model: anthropic.claude-v2 - TEXT:  Here is the code to get the 52nd Fibonacci number:

```python
a, b = 0, 1
for i in range(52):
    a, b = b, a + b
print(a)
```

The 52nd Fibonacci number is: 806515533049393