# Prepare dependency packages

In [55]:
!pip3 install jsonpath-ng boto3 --upgrade --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.29.63 requires botocore==1.31.63, but you have botocore 1.31.74 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Prompt examples

In [2]:
# Sample request bodies shared by the invoke cells below:
#   [0] chat-style prompt, non-streaming
#   [1] the same request with "stream": True
#   [2] bare prompt with custom stop sequences
params_examples = [
    {
        # "\n" is a real newline between the turns; a doubled backslash would
        # send the two characters "\" and "n" to the model instead.
        "prompt": "Human: Hello there!\nAssistant:",
        "max_new_tokens": 512,
        "repetition_penalty": 1.0,
        "return_full_text": False,
        "temperature": 0.7,
        "top_p": 0.3,
        "top_k": 1,
    },
    {
        "prompt": "Human: Hello there!\nAssistant:",
        "max_new_tokens": 512,
        "repetition_penalty": 1.0,
        "return_full_text": False,
        "temperature": 0.7,
        "top_p": 0.3,
        "top_k": 1,
        "stream": True
    },
    {
        "prompt": "Hello there!",
        "max_new_tokens": 512,
        "repetition_penalty": 1.0,
        "return_full_text": False,
        "temperature": 0.7,
        "top_p": 0.3,
        "top_k": 1,
        "stop": ["\nUser", "endoftext"]
    }
]

# Bedrock

## Available models

In [3]:
import boto3

# Client for the Bedrock service API in us-west-2.
br = boto3.client(
    "bedrock",
    region_name="us-west-2"
)

# Each summary is a dict of model metadata; print only its identifier so the
# listing shows the same names that are used as `model_name` in later cells.
for model in br.list_foundation_models()["modelSummaries"]:
    print(model["modelId"])

amazon.titan-tg1-large
amazon.titan-e1t-medium
amazon.titan-embed-g1-text-02
amazon.titan-text-express-v1
amazon.titan-embed-text-v1
stability.stable-diffusion-xl
stability.stable-diffusion-xl-v0
ai21.j2-grande-instruct
ai21.j2-jumbo-instruct
ai21.j2-mid
ai21.j2-mid-v1
ai21.j2-ultra
ai21.j2-ultra-v1
anthropic.claude-instant-v1
anthropic.claude-v1
anthropic.claude-v2
cohere.command-text-v14


## Invoke without streaming

In [4]:
from importlib import import_module

model_family = "bedrock"
model_names = [
    "amazon.titan-tg1-large",
    "ai21.j2-grande-instruct",
    "ai21.j2-jumbo-instruct",
    "ai21.j2-mid",
    "ai21.j2-mid-v1",
    "ai21.j2-ultra",
    "ai21.j2-ultra-v1",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14"
]

# Non-streaming call: every model receives the first example request and its
# "generated_text" is printed on one line.
for model_name in model_names:
    handler_module = import_module("handlers." + model_family)
    invoke = handler_module.model(model_name).invoke
    response = invoke(params_examples[0])
    print(f"model: {model_name} - TEXT: {response['generated_text']}")

model: amazon.titan-tg1-large - TEXT:  Hello, how can I help you today?
model: ai21.j2-grande-instruct - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-jumbo-instruct - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-mid - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-mid-v1 - TEXT:  How can I assist you today?
I'm here to help you with your questions.
model: ai21.j2-ultra - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: ai21.j2-ultra-v1 - TEXT:  How can I assist you today?
I'm here to help you with any questions you may have. How can I assist you today?
model: anthropic.claude-instant-v1 - TEXT:  Hello!
model: anthropic.claude-v1 - TEXT:  Hello! My name is Claude.
model: anthropic.claude-v2 - TEXT:  Hello! Nice to meet you.
model: cohere

## Invoke with streaming

In [16]:
from importlib import import_module

model_family = "bedrock"
# The ai21.j2-* models are left out of the streaming run.
model_names = [
    "amazon.titan-tg1-large",
    "anthropic.claude-instant-v1",
    "anthropic.claude-v1",
    "anthropic.claude-v2",
    "cohere.command-text-v14"
]

# Streaming call: send the second example request (the one with "stream": True)
# and echo each text event as it arrives.
for model_name in model_names:
    model_handler = import_module("handlers." + model_family).model(model_name)
    invoke = model_handler.invoke_with_response_stream
    print()
    print(f"model: {model_name} - TEXT: ", end="")
    for event in invoke(params_examples[1]):
        if "generated_text" not in event:
            continue
        if event["generated_text"] == "<EOS_TOKEN>":
            # End-of-stream marker: not part of the generated text.
            continue
        print(event["generated_text"], end="")

ModuleNotFoundError: No module named 'handlers'

# SageMaker

## Deploy TGI endpoint

In [None]:
from sagemaker.model import Model
from sagemaker import get_execution_role
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel
# Resolve the container image URI for the "huggingface" LLM backend, version 0.8.2.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="0.8.2"
)

role = get_execution_role()
hf_model_id = "tiiuae/falcon-7b-instruct" # model id from huggingface.co/models
# Derive the SageMaker model name from the model id by replacing "/" and "." with "-".
model_name = hf_model_id.replace("/","-").replace(".","-")
endpoint_name = "test-sagemaker-02"
instance_type = "ml.g5.2xlarge" # instance type to use for deployment
number_of_gpus = 1 # number of gpus to use for inference and tensor parallelism
health_check_timeout = 900 # health-check timeout of 900 seconds (15 minutes), leaving time to download the model

# Model definition: the env entries below are passed to the container as configuration.
llm_model = HuggingFaceModel(
      role=role,
      image_uri=llm_image,
      env={
        'HF_MODEL_ID': hf_model_id,
        # 'HF_MODEL_QUANTIZE': "bitsandbytes", # uncomment to quantize
        'SM_NUM_GPUS': f"{number_of_gpus}",
        'MAX_INPUT_LENGTH': "3000",  # Max length of input text
        'MAX_TOTAL_TOKENS': "6000",  # Max length of the generation (including input text)
        'HF_MODEL_REVISION': 'eb410fb6ffa9028e97adb801f0d6ec46d02f8b07'  # pins the model to one specific revision
      },
      name=model_name
    )

# Create the endpoint `endpoint_name` with a single instance of `instance_type`.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  endpoint_name=endpoint_name,
)

## Invoke without streaming

In [7]:
from importlib import import_module

model_family = "sagemaker"
# "test-sagemaker-02" matches the endpoint deployed above; the "tgi." prefix
# presumably selects the container type -- TODO confirm against handlers.sagemaker.
model_name = "tgi.test-sagemaker-02"

# Non-streaming request using the first example payload.
sagemaker_handlers = import_module("handlers." + model_family)
invoke = sagemaker_handlers.model(model_name).invoke
result = invoke(params_examples[0])
print(f"model: {model_name} - TEXT: {result['generated_text']}")

model: tgi.test-sagemaker-02 - TEXT:  Hi there! How can I help you today?
User 


## Invoke with streaming

In [3]:
from importlib import import_module

model_family = "sagemaker"
model_name = "tgi.test-sagemaker-02"

# Streaming request using the second example payload ("stream": True).
sagemaker_model = import_module("handlers." + model_family).model(model_name)
invoke = sagemaker_model.invoke_with_response_stream
print()
print(f"model: {model_name} - TEXT: ", end="")
for event in invoke(params_examples[1]):
    if "generated_text" not in event:
        continue
    if event["generated_text"] == "<EOS_TOKEN>":
        # End-of-stream marker: not part of the generated text.
        continue
    print(event["generated_text"], end="")


model: tgi.test-sagemaker-02 - TEXT:  Hi Hi there there!! How How can can I I help help you you today today??

UserUser  <|endoftext|><|endoftext|>

In [165]:
!docker build --quiet --tag api-layer:latest .

sha256:9264d66dcc830c404c29261428229ab669795009554b8f6781d6491e50a27c67


In [159]:
!docker run --name test-api-layer -d -p 8001:8001 -e AWS_REGION=us-west-2 -e AWS_DEFAULT_REGION=us-west-2 -e APP_LOG_LEVEL="debug" api-layer:latest --host 0.0.0.0 --port 8001 --table-name model-db

87e6f5d89e2cbb02b281a23cfc9dbbc4f742ed78e8f98e2b3510cfd2b533cbdb


In [140]:
!docker run --name test-api-layer -d -p 8001:8001 -e AWS_REGION=us-west-2 -e AWS_DEFAULT_REGION=us-west-2 -e APP_LOG_LEVEL="debug" -e APP_DDB_TABLE="model-db" -e APP_HOST="0.0.0.0" -e APP_PORT="8001" -e CW_NAMESPACE="Codenator/api-layer/" -e CW_METRIC_NAME="active-workers" -e APP_WORKERS=3 api-layer:latest

ca76c6fdc3b36b283667b3aa2afebf7f85bbca36b054905a0bcea256890f69b3


In [157]:
!docker kill test-api-layer && docker rm test-api-layer

test-api-layer
test-api-layer


In [138]:
!docker rm test-api-layer

test-api-layer


In [161]:
!docker logs test-api-layer

args: Namespace(host='0.0.0.0', port=8001, table_name='model-db', workers=3, namespace='Codenator/api-layer/', metric_name='active-workers')
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO:     Started parent process [1]
INFO:     Started server process [10]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Started server process [8]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Started server process [9]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [61]:
!docker context ls

NAME                             DESCRIPTION                               DOCKER ENDPOINT                                       ERROR
default                          Current DOCKER_HOST based configuration   unix:///var/run/docker.sock                           
m5.large_i-055aa5769c23350ff *                                             tcp://ip-10-3-1-251.us-west-2.compute.internal:1111   


In [19]:
host_ip = "ip-10-3-1-251.us-west-2.compute.internal"

In [142]:
%%bash -s $host_ip
curl -X POST -H 'Content-Type: application/json' ${1}:8001/invoke_stream -d '{"body": {"prompt": "Human: Hello\nAssistant:"}, "model_family": "bedrock", "model_name": "cohere.command-text-v14"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   383    0   267  100   116    432    187 --:--:-- --:--:-- --:--:--   620


{"generated_text": " Hi"}
{"generated_text": "!"}
{"generated_text": " How"}
{"generated_text": " can"}
{"generated_text": " I"}
{"generated_text": " help"}
{"generated_text": " you"}
{"generated_text": " today"}
{"generated_text": "?"}
{"finish_reason": "COMPLETE"}


In [80]:
%%bash -s $host_ip
curl ${1}:8001/list_models

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   760  100   760    0     0  53988      0 --:--:-- --:--:-- --:--:-- 54285


{"error":"An error occurred: An error occurred (ResourceNotFoundException) when calling the GetItem operation: Requested resource not found","stacktrace":"Traceback (most recent call last):\n  File \"/code/app/main.py\", line 30, in list_models\n    all_models = base.ddb_client.get_item(\n  File \"/usr/local/lib/python3.10/site-packages/botocore/client.py\", line 535, in _api_call\n    return self._make_api_call(operation_name, kwargs)\n  File \"/usr/local/lib/python3.10/site-packages/botocore/client.py\", line 983, in _make_api_call\n    raise error_class(parsed_response, operation_name)\nbotocore.errorfactory.ResourceNotFoundException: An error occurred (ResourceNotFoundException) when calling the GetItem operation: Requested resource not found\n"}

In [16]:
%%bash -s $host_ip
curl -X POST ${1}:8001/invoke_stream -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"}, "model_family": "bedrock", "model_name": "cohere.command-text-v14"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


{"generated_text": " Zee"}
{"generated_text": " was"}
{"generated_text": " a"}
{"generated_text": " girl"}
{"generated_text": " who"}
{"generated_text": " loved"}
{"generated_text": " the"}
{"generated_text": " sea"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "She"}
{"generated_text": " would"}
{"generated_text": " spend"}
{"generated_text": " hours"}
{"generated_text": " staring"}
{"generated_text": " at"}
{"generated_text": " the"}
{"generated_text": " deep"}
{"generated_text": " blue"}
{"generated_text": " waves"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "She"}
{"generated_text": " would"}
{"generated_text": " imagine"}
{"generated_text": " herself"}
{"generated_text": " as"}
{"generated_text": " a"}
{"generated_text": " mermaid"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "Sw"}
{"generated_text": "imming"}
{"generated_text": " with"}
{"generated_text": " the"}
{"generated_text": " fish"}
{"generated_text":

100  7294    0  7136  100   158    840     18  0:00:08  0:00:08 --:--:--  1000


{"generated_text": " \n"}
{"generated_text": "\n"}
{"generated_text": "She"}
{"generated_text": " would"}
{"generated_text": " often"}
{"generated_text": " tell"}
{"generated_text": " her"}
{"generated_text": " friends"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "About"}
{"generated_text": " the"}
{"generated_text": " underwater"}
{"generated_text": " worlds"}
{"generated_text": " she"}
{"generated_text": "'d"}
{"generated_text": " seen"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "Of"}
{"generated_text": " m"}
{"generated_text": "erma"}
{"generated_text": "ids"}
{"generated_text": " and"}
{"generated_text": " pirates"}
{"generated_text": ","}
{"generated_text": "\n"}
{"generated_text": "And"}
{"generated_text": " high"}
{"generated_text": "-"}
{"generated_text": "seas"}
{"generated_text": " adventures"}
{"generated_text": "."}
{"generated_text": " \n"}
{"generated_text": "\n"}
{"generated_text": "Z"}
{"generated_text": "ee"}
{"generat

In [7]:
from multiprocessing import Pool
import requests 
from datetime import datetime
import json

# Load-test target: the /invoke route behind the load balancer.
port = 8010
host = "internal-codenator-899847730.us-west-2.elb.amazonaws.com"
url = f"http://{host}:{port}/invoke"
# Pool size and number of requests submitted in the single batch below.
workers = 16
total_requests = 16

# Request payload; every request in the batch sends this same body.
body = {
    "body": {
        "prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:"
    },
    "model_family": "bedrock",
    "model_name": "cohere.command-text-v14"
}

def send_request(number, timeout=120):
    """POST the shared ``body`` to ``url`` once and report the timing.

    Args:
        number: Label for this request (the pool passes the request index).
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection blocks the worker forever.

    Returns:
        str: ``"<number> start: <t0>, end: <t1>, response: <snippet>"``, where
        the snippet is the first 20 characters of the response text, or an
        ``ERROR: <ExceptionName>`` marker when the request failed.
    """
    start = datetime.now()
    try:
        response = requests.post(url=url, data=json.dumps(body), timeout=timeout)
        snippet = response.text[:20]
    except requests.RequestException as exc:
        # Record the failure instead of raising: an exception raised in a
        # worker would abort the whole Pool.map batch and drop its results.
        snippet = f"ERROR: {type(exc).__name__}"
    end = datetime.now()
    res = f"{number} start: {start}, end: {end}, response: {snippet}"
    print(res)
    return res

if __name__ == "__main__":
    # One batch: request indices 0..total_requests-1 are spread over the pool,
    # and the per-request report strings come back in index order.
    numbers = range(total_requests)
    results = []
    with Pool(processes=workers) as p:
        results = p.map(func=send_request, iterable=numbers)

11 start: 2023-11-22 10:46:10.451669, end: 2023-11-22 10:46:18.677688, response: {"generated_text":" 
6 start: 2023-11-22 10:46:10.451164, end: 2023-11-22 10:46:26.803151, response: {"generated_text":" 
0 start: 2023-11-22 10:46:10.450633, end: 2023-11-22 10:46:34.783155, response: {"generated_text":" 
2 start: 2023-11-22 10:46:10.450793, end: 2023-11-22 10:46:42.769184, response: {"generated_text":" 
15 start: 2023-11-22 10:46:10.451968, end: 2023-11-22 10:46:50.745829, response: {"generated_text":" 
5 start: 2023-11-22 10:46:10.451096, end: 2023-11-22 10:46:58.751100, response: {"generated_text":" 
14 start: 2023-11-22 10:46:10.451922, end: 2023-11-22 10:47:06.850940, response: {"generated_text":" 
4 start: 2023-11-22 10:46:10.450978, end: 2023-11-22 10:47:10.473538, response: <html>
<head><title9 start: 2023-11-22 10:46:10.451487, end: 2023-11-22 10:47:10.473547, response: <html>
<head><title13 start: 2023-11-22 10:46:10.451857, end: 2023-11-22 10:47:10.473548, response: <html>
<hea

In [169]:
from multiprocessing import Pool
import requests 
from datetime import datetime
import json

# Load-test target: the /invoke route behind the load balancer.
port = 8010
host = "internal-codenator-899847730.us-west-2.elb.amazonaws.com"
# Alternative target: the API-layer container on the docker host directly.
# host ="ip-10-3-1-251.us-west-2.compute.internal"
# port = 8001
url = f"http://{host}:{port}/invoke"
workers = 32
# NOTE: the driver below takes "total_requests" and "loops" from each entry of
# `steps`; these two top-level values are not what the stepped run uses.
total_requests = 32
loops = 30
# Ramp schedule: each step sends `total_requests` requests per batch and
# repeats that batch `loops` times (4x80, 8x40, 16x20, 32x10 -- 320 requests per step).
steps = [
    {
        "total_requests": 4,
        "loops": 80
    },
    {
        "total_requests": 8,
        "loops": 40
    },
    {
        "total_requests": 16,
        "loops": 20
    },
    {
        "total_requests": 32,
        "loops": 10
    }
]

# Request payload; every request sends this same body.
body = {
    "body": {
        "prompt": "Human: Write a Hello world script using python\nAssistant:"
    },
    "model_family": "bedrock",
    "model_name": "cohere.command-text-v14"
}

def send_request(number, timeout=120):
    """POST the shared ``body`` to ``url`` once and report the timing.

    Args:
        number: Label for this request (the pool passes the request index).
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection blocks the worker forever.

    Returns:
        str: ``"<number> start: <t0>, end: <t1>, response: <snippet>"``, where
        the snippet is the first 20 characters of the response text, or an
        ``ERROR: <ExceptionName>`` marker when the request failed.
    """
    start = datetime.now()
    try:
        response = requests.post(url=url, data=json.dumps(body), timeout=timeout)
        snippet = response.text[:20]
    except requests.RequestException as exc:
        # Record the failure instead of raising: an exception raised in a
        # worker would abort the whole Pool.map batch and drop its results.
        snippet = f"ERROR: {type(exc).__name__}"
    end = datetime.now()
    res = f"{number} start: {start}, end: {end}, response: {snippet}"
    print(res)
    return res

if __name__ == "__main__":
    # Report strings from every batch of every step, in submission order.
    results = []

    # One pool is reused for the whole ramp so worker start-up cost is paid once.
    with Pool(workers) as p:
        for i, step in enumerate(steps):
            loops = step["loops"]
            numbers = range(step["total_requests"])
            for loop in range(loops):
                print(f"Step: {i}, Loop: {loop}")
                # Extend rather than assign: assigning kept only the final batch.
                results.extend(p.map(send_request, numbers))

Step: 0, Loop: 0
3 start: 2023-11-23 10:42:57.621209, end: 2023-11-23 10:43:04.145550, response: {"generated_text":" 
2 start: 2023-11-23 10:42:57.621115, end: 2023-11-23 10:43:04.195626, response: {"generated_text":" 
1 start: 2023-11-23 10:42:57.621013, end: 2023-11-23 10:43:13.490601, response: {"generated_text":" 
0 start: 2023-11-23 10:42:57.620927, end: 2023-11-23 10:43:13.493305, response: {"generated_text":" 
Step: 0, Loop: 1
2 start: 2023-11-23 10:43:13.496785, end: 2023-11-23 10:43:20.005442, response: {"generated_text":" 
1 start: 2023-11-23 10:43:13.496679, end: 2023-11-23 10:43:20.131133, response: {"generated_text":" 
0 start: 2023-11-23 10:43:13.496577, end: 2023-11-23 10:43:23.891018, response: {"generated_text":" 
3 start: 2023-11-23 10:43:13.496894, end: 2023-11-23 10:43:26.394720, response: {"generated_text":" 
Step: 0, Loop: 2
2 start: 2023-11-23 10:43:26.398493, end: 2023-11-23 10:43:32.863484, response: {"generated_text":" 
3 start: 2023-11-23 10:43:26.398605, end

Process ForkPoolWorker-334:
Process ForkPoolWorker-335:
Process ForkPoolWorker-340:
Process ForkPoolWorker-325:
Process ForkPoolWorker-343:
Process ForkPoolWorker-324:
Process ForkPoolWorker-321:
Process ForkPoolWorker-348:
Process ForkPoolWorker-327:
Process ForkPoolWorker-328:
Process ForkPoolWorker-338:
Process ForkPoolWorker-351:
Process ForkPoolWorker-352:
Process ForkPoolWorker-337:
Process ForkPoolWorker-341:


KeyboardInterrupt: 

In [147]:
import time

start = time.perf_counter()

In [149]:
int((time.perf_counter() - start) * 1000)

87408

In [11]:
import boto3

cw_client = boto3.client("cloudwatch")

# A single data point (5, unit Count) for the "api-layer-workers" metric.
worker_metric = {
    "MetricName": "api-layer-workers",
    "Value": 5,
    "Unit": "Count",
}
cw_client.put_metric_data(
    Namespace="Codenator/api-layer/",
    MetricData=[worker_metric],
)

{'ResponseMetadata': {'RequestId': 'd267c641-d126-453d-a1d3-04d8424569a9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd267c641-d126-453d-a1d3-04d8424569a9',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Thu, 23 Nov 2023 00:44:30 GMT'},
  'RetryAttempts': 0}}

In [280]:
cw_client = boto3.client("cloudwatch")
count = 0
# NOTE(review): no trailing slash here, unlike the "Codenator/api-layer/"
# namespace used elsewhere in this notebook -- confirm which one is intended.
namespace = "Codenator/api-layer"
metric_name = "api-layer-workers"

# Publish the current value of `count` as one data point.
metric_datum = {
    "MetricName": metric_name,
    "Value": count,
    "Unit": "Count",
}
cw_client.put_metric_data(
    Namespace=namespace,
    MetricData=[metric_datum],
)

{'ResponseMetadata': {'RequestId': '95a8b6b7-7ada-4ffb-85d2-013049330a23',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '95a8b6b7-7ada-4ffb-85d2-013049330a23',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Thu, 23 Nov 2023 05:04:23 GMT'},
  'RetryAttempts': 0}}

In [9]:
cw_client.__dict__

{'_serializer': <botocore.validate.ParamValidationDecorator at 0x7f29931d3f10>,
 '_endpoint': monitoring(https://monitoring.us-west-2.amazonaws.com),
 '_ruleset_resolver': <botocore.regions.EndpointRulesetResolver at 0x7f29931d3fa0>,
 '_response_parser': <botocore.parsers.QueryParser at 0x7f29931e0040>,
 '_request_signer': <botocore.signers.RequestSigner at 0x7f29931d3b80>,
 '_cache': {},
 '_loader': <botocore.loaders.Loader at 0x7f299379f610>,
 '_client_config': <botocore.config.Config at 0x7f29931d3c10>,
 'meta': <botocore.client.ClientMeta at 0x7f29931d3c70>,
 '_exceptions_factory': <botocore.errorfactory.ClientExceptionsFactory at 0x7f29937675e0>,
 '_exceptions': None,
 '_user_agent_creator': <botocore.useragent.UserAgentString at 0x7f2993210d90>}

In [18]:
import requests


# Invoke URL of the API layer; the model listing is served from the same base.
api_layer_url = f"http://{host_ip}:8001/invoke"
list_models_url = api_layer_url.split("/invoke")[0] + "/list_models"
requests.get(url=list_models_url).text

'{"models":[{"model_type":"Claude","model_name":"anthropic.claude-instant-v1","model_family":"bedrock","streaming":true},{"model_type":"Claude","model_name":"anthropic.claude-v1","model_family":"bedrock","streaming":true},{"model_type":"Claude","model_name":"anthropic.claude-v2","model_family":"bedrock","streaming":true},{"model_type":"Titan","model_name":"amazon.titan-tg1-large","model_family":"bedrock","streaming":true},{"model_type":"Jurassic","model_name":"ai21.j2-grande-instruct","model_family":"bedrock","streaming":false},{"model_type":"Jurassic","model_name":"ai21.j2-jumbo-instruct","model_family":"bedrock","streaming":false},{"model_type":"Jurassic","model_name":"ai21.j2-mid","model_family":"bedrock","streaming":false},{"model_type":"Jurassic","model_name":"ai21.j2-mid-v1","model_family":"bedrock","streaming":false},{"model_type":"Jurassic","model_name":"ai21.j2-ultra-v1","model_family":"bedrock","streaming":false},{"model_type":"Cohere","model_name":"cohere.command-text-v14","

In [2]:
CODE_INTERPRETER_SYSTEM_PROMPT = """You are a helpful AI assistant.

You have access to a python code interpreter, which supports you in your tasks.
The code is executed in an interactive shell, imports and variables are preserved between calls.
The environment has internet and file system access.
The current working directory is shared with the user, so files can be exchanged.
There are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.
You cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.
If the code runs too long, there will be a timeout.

To access the interpreter, use the following format:
```python
<your code>
```
If you want to call Python and still say something, do only output text above the code block, NOT below.
Only provide at most one code block per message.
The code will be executed automatically and the result will be sent back to you
"""

ROLES = ["Human", "Assistant"]

In [3]:
import os

os.chdir("app")

In [4]:
from importlib import import_module

model_family = "bedrock"
model_name = "cohere.command-text-v14"

text = "what is 52th fibonacci number?"

# Single-turn prompt: the system instructions and the question form the first
# role's turn; a newline ("\n", not the two characters "/n") then opens the
# second role's turn.
prompt = ROLES[0] + ": " + CODE_INTERPRETER_SYSTEM_PROMPT + " QUESTION: " + text + "\n" + ROLES[1] + ":"

invoke = import_module("handlers." + model_family).model(model_name).invoke_with_response_stream
print()
print(f"model: {model_name} - TEXT: ", end="")
for i in invoke({"prompt": prompt}):
    # Echo text events only, skipping the end-of-stream marker.
    if "generated_text" in i and i["generated_text"] != "<EOS_TOKEN>":
        print(i['generated_text'], end="")


model: cohere.command-text-v14 - TEXT:  ```python
n = int(input("Please provide a number: "))

if n == 0:
    print(0)
elif n == 1:
    print(1)
else:
    a, b = 0, 1
    for _ in range(52):
        a, b = b, a + b
    print(b)
```
Please provide a number so the code can calculate the 52nd Fibonacci number.

In [14]:
import json

# Serialize with json.dumps so that quotes, backslashes and newlines inside the
# prompt are escaped correctly; hand-concatenated JSON breaks as soon as the
# prompt contains a double quote or a backslash.
data = json.dumps({
    "body": {"prompt": prompt},
    "model_family": model_family,
    "model_name": model_name,
})

In [15]:
import json

json.loads(data)

{'body': {'prompt': 'Human: You are a helpful AI assistant.\n\nYou have access to a python code interpreter, which supports you in your tasks.\nThe code is executed in an interactive shell, imports and variables are preserved between calls.\nThe environment has internet and file system access.\nThe current working directory is shared with the user, so files can be exchanged.\nThere are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.\nYou cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.\nIf the code runs too long, there will be a timeout.\n\nTo access the interpreter, use the following format:\n```python\n<your code>\n```\nIf you want to call Python and still say something, do only output text above the code block, NOT below.\nOnly provide at most one code block per message.\nThe code will be executed automatically and the result will be sent back to you\n QUESTION: what is 52t

In [18]:
import requests

host_url = f"http://{host_ip}:8001/invoke_stream"

def iter_func(result):
    """Yield the generated text carried by each line of a streamed response.

    Args:
        result: Object whose ``iter_lines()`` yields one JSON document per
            line (for example a ``requests`` response opened with
            ``stream=True``).

    Yields:
        str: The ``"generated_text"`` value of every chunk that has one.
        Blank lines and chunks without that key (such as a trailing
        ``{"finish_reason": ...}`` chunk) are skipped instead of raising.
    """
    for raw_line in result.iter_lines():
        if not raw_line:
            # Blank separator line: nothing to decode.
            continue
        chunk = json.loads(raw_line)
        if "generated_text" in chunk:
            yield chunk["generated_text"]

# Open the streaming request and echo every piece of text as it arrives.
res = requests.post(url=host_url, data=data, stream=True)
for piece in iter_func(res):
    print(piece, end="")

 Here is the code to get the 52nd Fibonacci number:

```python
a, b = 0, 1
for i in range(50):
    a, b = b, a + b
print(a)
```

The 52nd Fibonacci number is: 806515533049393

In [43]:
api_layer_url = f"http://{host_ip}:8001/invoke"

In [57]:
stream = True
print(host_url)
print(api_layer_url + ("" if not stream else "_stream"))

http://ip-172-31-75-227.ap-southeast-2.compute.internal:8001/invoke_stream
http://ip-172-31-75-227.ap-southeast-2.compute.internal:8001/invoke_stream


In [108]:
def send_req_to_agent(text, model_family, model_name, stream=False):
    """Send a prompt to the API layer and return the generated text.

    Args:
        text: Prompt placed in the request's ``body.prompt`` field.
        model_family: Value of the request's ``model_family`` field.
        model_name: Value of the request's ``model_name`` field.
        stream: When true, POST to ``api_layer_url + "_stream"`` and return a
            generator instead of a string.

    Returns:
        With ``stream=False``: the ``"generated_text"`` string of the response.
        With ``stream=True``: a generator yielding each chunk's
        ``"generated_text"`` and finally the sentinel ``"|STOP|"``.
    """
    def iter_func(res):
        for raw_line in res.iter_lines():
            if not raw_line:
                # Blank separator line: nothing to decode.
                continue
            chunk_dict = json.loads(raw_line)
            # Chunks without text (e.g. a trailing finish_reason chunk) are skipped.
            if "generated_text" in chunk_dict:
                yield chunk_dict["generated_text"]
        yield "|STOP|"

    data = {
        "body": {
            "prompt": text
        },
        "model_family": model_family,
        "model_name": model_name
    }
    ret = requests.post(
        url=api_layer_url + ("" if not stream else "_stream"),
        data=json.dumps(data),
        stream=stream
    )
    if stream:
        return iter_func(ret)
    return json.loads(ret.text)["generated_text"]

In [106]:
send_req_to_agent(prompt, model_family, model_name, stream=False)

' Here is the code to get the 52nd Fibonacci number:\n\n```python\na, b = 0, 1\nfor i in range(50):\n    a, b = b, a + b\nprint(a)\n```\n\nThe 52nd Fibonacci number is: 806515533049393'

In [109]:
# Echo the streamed text; the "|STOP|" sentinel marks the end of the stream.
for text in send_req_to_agent(prompt, model_family, model_name, stream=True):
    if text == "|STOP|":
        print("\n\n Stream ended")
    else:
        print(text, end="")

 Here is the code to get the 52nd Fibonacci number:

```python
a, b = 0, 1
for i in range(50):
    a, b = b, a + b
print(a)
```

The 52nd Fibonacci number is: 806515533049393

 Stream ended


In [5]:
!pip install sagemaker -U -q

[0m

In [32]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Prefer the notebook's own execution role; if that lookup raises ValueError,
# fall back to the ARN of the IAM role named "sagemaker_execution_role".
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID': 'codellama/CodeLlama-13b-instruct-hf',
    'SM_NUM_GPUS': '4',
}

# Model definition on the "huggingface" LLM container, version 1.1.0.
huggingface_model = HuggingFaceModel(
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.1.0"),
    env=hub,
    role=role,
)

# Create the endpoint that later cells address by name.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    container_startup_health_check_timeout=300,
    endpoint_name="code-llama-13b-instruct-endpoint",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
------------!

In [35]:
%%bash -s $host_ip
curl -X POST ${1}:8001/invoke -d '{"body":{"prompt": "Human: Write a script to list all my Amazon SageMaker models\nAssistant:"}, "model_family": "sagemaker", "model_name": "tgi.code-llama-13b-instruct-endpoint"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   323  100   145  100   178    263    323 --:--:-- --:--:-- --:--:--   586


{"finish_reason":"length","generated_tokens":20,"seed":null,"generated_text":" Here is a list of all your Amazon SageMaker models:\n\n1. Model1"}

In [20]:
import subprocess
from six.moves.urllib.parse import urlparse
import json

def get_docker_host():
    """Discover the remote docker host address, falling back to "localhost".

    Runs ``docker context inspect`` and reads the Docker endpoint URL of the
    first context it reports. Only ``tcp://`` endpoints name a remote host.

    Returns:
        str: Docker host DNS name or IP address, or ``"localhost"`` when the
        endpoint is not ``tcp://`` or the context cannot be determined (docker
        CLI missing, command failed or wrote to stderr, unexpected output).
    """
    # Standard-library parser, imported locally so this function does not
    # depend on the `six` compatibility package.
    from urllib.parse import urlparse

    try:
        completed = subprocess.run(
            ["docker", "context", "inspect"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError:
        # The docker CLI is not installed or cannot be executed.
        return "localhost"
    if completed.returncode != 0 or completed.stderr:
        return "localhost"

    try:
        contexts = json.loads(completed.stdout.decode("utf-8"))
        docker_context_host_url = contexts[0]["Endpoints"]["docker"]["Host"]
        parsed_url = urlparse(docker_context_host_url)
    except (ValueError, LookupError, TypeError, AttributeError):
        # Output was not the expected JSON structure.
        return "localhost"

    if parsed_url.scheme == "tcp" and parsed_url.hostname:
        return parsed_url.hostname
    return "localhost"

In [21]:
host_ip = get_docker_host()
print(host_ip)

ip-172-31-78-189.ap-southeast-2.compute.internal


In [7]:
import io

class StreamIterator:
    """Re-assemble newline-delimited lines from a stream of payload events.

    The wrapped stream yields dicts; events with a ``"PayloadPart"`` key
    contribute ``event["PayloadPart"]["Bytes"]`` to an internal buffer, and
    other events are reported with a printed message and ignored. A line may
    be split across several events, so bytes are buffered until a newline
    is seen.

    Iterating yields each line as ``bytes`` without its trailing newline.
    Bytes left over when the stream ends without a final newline are yielded
    as a last line.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in `buffer` that has not been returned yet.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line.endswith(b"\n"):
                self.read_pos += len(line)
                return line[:-1]
            # No complete line buffered yet: pull the next event.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # Stream ended mid-line: flush the remainder once.
                    # (Retrying here would spin forever, since no more
                    # bytes can arrive to complete the line.)
                    self.read_pos += len(line)
                    return line
                raise
            if "PayloadPart" not in chunk:
                print(f"Unknown event type: {chunk}")
                continue
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

In [4]:
sys_promp = """ You are a helpful AI assistant.

You have access to a Python code interpreter, which supports you in your tasks.
The code is executed in an interactive shell, imports and variables are preserved between calls.
The environment has internet and file system access.
The current working directory is shared with the user, so files can be exchanged.
You cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.
If the code runs too long, there will be a timeout.

To access the interpreter, use the following format:
```python
<your code>
```

Report expected output and enclose it within a <output></output> tags.
If you want to call Python and still say something, do only output text above the code block, NOT below.
Only provide at most one code block per message.
The code will be executed automatically and the result will be sent back to you
"""

message = "Write a Hello world script."

In [5]:
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant with a deep knowledge of code and software design. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
"""
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4000

def get_prompt(message: str, chat_history: list[tuple[str, str]],
               system_prompt: str) -> str:
    """Render a system prompt, prior turns and a new message into one prompt string.

    Args:
        message: The newest user message. It is stripped only when at least
            one earlier turn exists in ``chat_history``.
        chat_history: Earlier ``(user_input, response)`` pairs, oldest first.
            The first user input is kept verbatim; later user inputs and every
            response are stripped of surrounding whitespace.
        system_prompt: Text placed between the ``<<SYS>>`` / ``<</SYS>>`` markers.

    Returns:
        The concatenated prompt, ending in ``[/INST]``.
    """
    pieces = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    turns_seen = 0
    for turns_seen, (user_input, response) in enumerate(chat_history, start=1):
        # Only the very first user input is left untouched.
        if turns_seen > 1:
            user_input = user_input.strip()
        pieces.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
    # With no history the message itself is the first user input: keep it as is.
    if turns_seen:
        message = message.strip()
    pieces.append(f'{message} [/INST]')
    return ''.join(pieces)

# Start from an empty conversation: no earlier (user_input, response) turns.
history = []

# Build the initial prompt with the interpreter system prompt (`sys_promp`);
# DEFAULT_SYSTEM_PROMPT is left as a commented-out alternative.
prompt = get_prompt(message, history, sys_promp)#DEFAULT_SYSTEM_PROMPT)


In [28]:
import boto3
import json

# Name of the SageMaker endpoint to call.
endpoint_name = "code-llama-13b-instruct-endpoint"

smr = boto3.client("sagemaker-runtime")
special = False
# Request body: the running `prompt` plus generation parameters. Sampling is
# disabled (do_sample=False), so the sampling-related knobs are left as None.
data = {
    "inputs": prompt,
    "parameters": {
        "best_of": None,
        "decoder_input_details": False,
        "details": True,
        "do_sample": False,
        "repetition_penalty": None,
        "return_full_text": False,
        "seed": None,
        "temperature": None,
        "top_k": None,
        "top_p": None,
        "truncate": None,
        "typical_p": None,
        "watermark": False,
        "max_new_tokens": 1024,
        "stop": ["</s>"],
    },
    "stream": True
}

res = smr.invoke_endpoint_with_response_stream(
    Body=json.dumps(data),
    EndpointName=endpoint_name,
    ContentType="application/json"
)

text = ""
for chunk in StreamIterator(res["Body"]):
    if not chunk:
        continue
    # The first five characters of each chunk are dropped before decoding
    # (presumably a "data:" prefix - confirm against StreamIterator). Decode the
    # JSON once per chunk instead of once per field access.
    token = json.loads(chunk[5:])["token"]
    special = token["special"]
    # Every token, special or not, is accumulated into `text` ...
    text += token["text"]
    # ... but only non-special tokens are echoed to the cell output.
    if not special:
        print(token["text"], end="")

# Extend the running prompt with the generated reply so a follow-up turn can be
# appended. NOTE: this makes the cell non-idempotent - re-running it appends again.
prompt += text

 Certainly! Here is an example of how you can use the `markdown` tag and include a script language in your code:
```
<markdown>
# Heading

This is a paragraph of text.

<script>
 console.log("Hello, world!");
</script>

This is another paragraph of text.
</markdown>
```
In this example, the `markdown` tag is used to indicate that the text inside the tag should be interpreted as Markdown. The `script` tag is used to include a script language, in this case JavaScript. The `console.log` function is used to print a message to the console.

Note that the `script` tag must be placed inside the `markdown` tag in order for the script to be executed. If the `script` tag is placed outside of the `markdown` tag, it will not be executed.

In [27]:
prompt

'<s>[INST] <<SYS>>\n You are a helpful AI assistant.\n\nYou have access to a Python code interpreter, which supports you in your tasks.\nThe code is executed in an interactive shell, imports and variables are preserved between calls.\nThe environment has internet and file system access.\nThe current working directory is shared with the user, so files can be exchanged.\nYou cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.\nIf the code runs too long, there will be a timeout.\n\nTo access the interpreter, use the following format:\n```python\n<your code>\n```\n\nReport expected output and enclose it within a <output></output> tags.\nIf you want to call Python and still say something, do only output text above the code block, NOT below.\nOnly provide at most one code block per message.\nThe code will be executed automatically and the result will be sent back to you\n\n<</SYS>>\n\nWrite a Hello world script. [/INST] `

In [26]:
# Append a follow-up user instruction to the running prompt.
# NOTE: `+=` mutates `prompt` in place, so re-running this cell appends the
# same instruction again.
prompt += "<s>[INST] Use markdown tag and include script language.[/INST] "

In [30]:
# Parameters handed to the handler's invoke_with_response_stream(...) call in a
# later cell. The sampling knobs are commented out, so whatever defaults the
# handler applies are used for them.
params = {
    "prompt": prompt,
    "max_new_tokens": 512,
    # "return_full_text": False,
    # "temperature": 0.7,
    # "top_p": 0.3,
    # "top_k": 1,
    "stop": ["</s>"],
    "stream": True
}

In [31]:
# Routing keys: they are embedded in the request body sent to /invoke_stream and
# `model_family` also selects the `handlers.<model_family>` module imported in a
# later cell.
model_family = "sagemaker"
model_name = "tgi.code-llama-13b-instruct-endpoint"

In [35]:
import json

# Build the request body with json.dumps instead of string concatenation.
# The hand-built version only escaped newlines, so any double quote or
# backslash inside `prompt` (e.g. generated code such as console.log("...")),
# `model_family` or `model_name` produced invalid JSON. json.dumps escapes all
# of them and yields the same key order and layout for plain inputs.
data = json.dumps({
    "body": {"prompt": prompt, "max_new_tokens": 512},
    "model_family": model_family,
    "model_name": model_name,
})

In [36]:
data

'{"body": {"prompt": "<s>[INST] <<SYS>>\\n You are a helpful AI assistant.\\n\\nYou have access to a Python code interpreter, which supports you in your tasks.\\nThe code is executed in an interactive shell, imports and variables are preserved between calls.\\nThe environment has internet and file system access.\\nThe current working directory is shared with the user, so files can be exchanged.\\nYou cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.\\nIf the code runs too long, there will be a timeout.\\n\\nTo access the interpreter, use the following format:\\n```python\\n<your code>\\n```\\n\\nReport expected output and enclose it within a <output></output> tags.\\nIf you want to call Python and still say something, do only output text above the code block, NOT below.\\nOnly provide at most one code block per message.\\nThe code will be executed automatically and the result will be sent back to you\\n\\n<</SYS>>

In [37]:
import requests

host_url = f"http://{host_ip}:8001/invoke_stream"

def iter_func(result):
    """Yield the ``generated_text`` field of each JSON line in a streamed response.

    Args:
        result: Object exposing ``iter_lines()``; in this notebook it is the
            response returned by ``requests.post(..., stream=True)``.

    Yields:
        The value stored under ``"generated_text"`` in each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a decoded line has no ``"generated_text"`` key.
    """
    for chunk in result.iter_lines():
        # Streamed responses can contain empty lines (keep-alives, trailing
        # newlines); they are not JSON documents and would crash json.loads.
        if not chunk.strip():
            continue
        yield json.loads(chunk)["generated_text"]

# POST the JSON string built in the previous cell; stream=True lets the body be
# consumed line by line instead of being downloaded in one piece.
# NOTE(review): the HTTP status is never checked, so an error response would
# surface as a JSON/KeyError inside iter_func rather than as an HTTP error.
res = requests.post(
    url=host_url,
    data=data,
    stream=True
)
# print(res)
# Echo each generated fragment as it arrives, without adding newlines.
for chunk in iter_func(res):
    print(chunk, end="")

 ```
for i in range(1, 41, 2):
    print(i)
```
<output>
1
3
5
7
9
11
13
15
17
19
21
23
25
27
29
31
33
35
37
39
41
</output>></s>

In [48]:
%%bash -s $host_ip
# ${1} is the notebook variable host_ip, passed in through `-s` on the magic line.
# The -d payload is sent as-is; curl prints its transfer progress alongside the streamed JSON lines.
curl -X POST ${1}:8001/invoke_stream -d '{"body":{"prompt": "Human: Hello, write a poem about a young girl named Zee\nAssistant:", "stop": ["Assistant"]}, "model_family": "sagemaker", "model_name": "tgi.code-llama-13b-instruct-endpoint"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


{"text": " Sure", "logprobs": -1.5810547, "generated_text": " Sure", "id": 18585, "special": false}
{"text": ",", "logprobs": -0.34423828, "generated_text": ",", "id": 29892, "special": false}
{"text": " here", "logprobs": -0.5854492, "generated_text": " here", "id": 1244, "special": false}
{"text": " is", "logprobs": -0.5283203, "generated_text": " is", "id": 338, "special": false}
{"text": " a", "logprobs": -0.13354492, "generated_text": " a", "id": 263, "special": false}
{"text": " poem", "logprobs": -0.07751465, "generated_text": " poem", "id": 26576, "special": false}
{"text": " about", "logprobs": -0.14099121, "generated_text": " about", "id": 1048, "special": false}
{"text": " a", "logprobs": -0.2401123, "generated_text": " a", "id": 263, "special": false}
{"text": " young", "logprobs": -0.010192871, "generated_text": " young", "id": 4123, "special": false}
{"text": " girl", "logprobs": -0.0011835098, "generated_text": " girl", "id": 7826, "special": false}
{"text": " named", "l

100   98k    0   98k  100   196   2181      4  0:00:49  0:00:46  0:00:03  2284


{"text": "ining", "logprobs": -2.6226044e-06, "generated_text": "ining", "id": 2827, "special": false}
{"text": " light", "logprobs": -0.0012187958, "generated_text": " light", "id": 3578, "special": false}
{"text": ",", "logprobs": -0.00024986267, "generated_text": ",", "id": 29892, "special": false}
{"text": "\n", "logprobs": -0.0001784563, "generated_text": "\n", "id": 13, "special": false}
{"text": "A", "logprobs": -0.00023913383, "generated_text": "A", "id": 29909, "special": false}
{"text": " tre", "logprobs": -0.000108242035, "generated_text": " tre", "id": 2578, "special": false}
{"text": "asure", "logprobs": -2.0742416e-05, "generated_text": "asure", "id": 3745, "special": false}
{"text": " to", "logprobs": -2.6464462e-05, "generated_text": " to", "id": 304, "special": false}
{"text": " beh", "logprobs": -9.894371e-05, "generated_text": " beh", "id": 2306, "special": false}
{"text": "old", "logprobs": -4.4107437e-06, "generated_text": "old", "id": 1025, "special": false}
{"tex

In [11]:
import os
# Switch the kernel's working directory to ./app for the following cells.
# NOTE: this is process-wide and relative, so running the cell twice tries to
# enter app/app, and later relative paths are resolved from inside app/.
os.chdir("app")

In [12]:
import jsonpath_ng

In [20]:
from importlib import import_module

# Resolve the handler module for the selected family, then bind the streaming
# entry point of the chosen model.
handler_module = import_module("handlers." + model_family)
invoke = handler_module.model(model_name).invoke_with_response_stream
print()
print(f"model: {model_name} - TEXT:\n", end="")
count = 0
for i in invoke(params):
    # Events without generated text carry nothing to print.
    if "generated_text" not in i:
        continue
    # Stop at the first event that reports a finish reason on a special token.
    if "finish_reason" in i and i["special"]:
        break
    print(i['generated_text'], end="")

ModuleNotFoundError: No module named 'handlers'

In [2]:
import os

# Read the Hugging Face access token from the environment instead of committing
# it to the notebook. Export HUGGING_FACE_HUB_TOKEN before starting the kernel;
# a missing variable raises KeyError here rather than failing later at deploy time.
# SECURITY: a real token was previously hardcoded on this line - it must be
# treated as leaked and revoked.
HUGGING_FACE_HUB_TOKEN = os.environ["HUGGING_FACE_HUB_TOKEN"]

In [104]:
from sagemaker.model import Model
from sagemaker import get_execution_role
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel
# Look up the Hugging Face LLM container image URI (version 1.1.0)
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.1.0"
)

role = get_execution_role()
hf_model_id = "meta-llama/Llama-2-70b-chat-hf" # model id from huggingface.co/models
# Derive the model name from the id ("/" and "." replaced by "-").
# NOTE: this rebinds `model_name`, which an earlier cell set to a handler routing key.
model_name = hf_model_id.replace("/","-").replace(".","-")
endpoint_name = "Llama-2-70b-chat-hf-endpoint"
instance_type = "ml.g5.48xlarge" # instance type to use for deployment
number_of_gpus = 8 # number of gpus to use for inference and tensor parallelism
health_check_timeout = 900 # startup health-check timeout while the model downloads; presumably seconds, i.e. 15 minutes (the earlier comment said 5) - TODO confirm unit

# Model definition: the env entries are passed to the container as environment variables.
llm_model = HuggingFaceModel(
      role=role,
      image_uri=llm_image,
      env={
        'HF_MODEL_ID': hf_model_id,
        'HUGGING_FACE_HUB_TOKEN': HUGGING_FACE_HUB_TOKEN,
        'SM_NUM_GPUS': f"{number_of_gpus}",
        'MAX_INPUT_LENGTH': "3000",  # Max length of input text
        'MAX_TOTAL_TOKENS': "6000",  # Max length of the generation (including input text)
      },
      name=model_name
    )

# Create the endpoint on a single instance of the chosen type.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  endpoint_name=endpoint_name,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Using already existing model: meta-llama-Llama-2-70b-chat-hf


-------------------!

In [34]:
import boto3
import json

# Name of the SageMaker endpoint to call in this cell.
endpoint_name="jumpstart-dft-meta-textgeneration-llama-2-13b"

smr = boto3.client("sagemaker-runtime")
special = False  # not read anywhere in this cell
# Request body: the current `prompt` plus sampling parameters. The commented-out
# keys are alternatives that were tried; no "stream" or "stop" key is sent.
data = {
    "inputs": prompt,
    "parameters": {
        "repetition_penalty": None,
        "return_full_text": False,
        # "seed": None,
        # "temperature": None,
        # "top_k": None,
        # "top_p": None,
        "temperature": 0.3,
        "top_k": 1,
        "top_p": 0.3,
        "max_new_tokens": 1024,#MAX_MAX_NEW_TOKENS,
        # "truncate": 10
        # "stop": ["</s>"],
        # "early_stopping": "</s>",
        # "temperature": 0.1,
        # "return_full_text": True
    }#,
    # "stream": True
}

# CustomAttributes carries the "accept_eula=true" flag with the request.
res = smr.invoke_endpoint_with_response_stream(
    Body=json.dumps(data),
    EndpointName=endpoint_name,
    ContentType="application/json",
    CustomAttributes="accept_eula=true"
)

# Dump every raw event of the response stream, separated by blank lines, to
# inspect how the payload is split across events.
events = res["Body"]
for event in events:
    print(event)
    print()
    print()

# Non-streaming variant of the same call, kept for comparison:
# res = smr.invoke_endpoint(
#     Body=json.dumps(data),
#     EndpointName=endpoint_name,
#     ContentType="application/json",
#     CustomAttributes="accept_eula=true"
# )

# print(res["Body"].read())

{'PayloadPart': {'Bytes': b'[\n  {\n    "generation":"\\n\\n[OUT] <<SYS>>\\n\\n```python\\nimport boto3\\n\\nsagemaker = boto3.client(\'sagemaker\')\\n\\nmodels = sagemaker.list_models(\\n    RegionName=\'sydney\',\\n    Filters=[\\n        {\\n            \'Name\': \'CreationTime\',\\n            \'Values\': [\\n'}}


{'PayloadPart': {'Bytes': b'                \'2021-01-01T00:00:00Z\',\\n                \'2021-01-02T00:00:00Z\',\\n                \'2021-01-03T00:00:00Z\',\\n                \'2021-01-04T00:00:00Z\',\\n                \'2021-01-05T00:00:00Z\',\\n                \'2021-01-06T00:00:00Z\',\\n                \'2021-01-07T00:00:00Z\',\\n                \'2021-01-08T00:00:00Z\',\\n                \'2021-01-09T00:00:00Z\',\\n                \'2021-01-10T00:00:00Z\',\\n                \'2021-01-11T00:00:00Z\',\\n                \'2021-01-12T00:00:00Z\',\\n                \'2021-01-13T00:00:00Z\',\\n                \'2021-01-14T00:00:00Z\',\\n                \'2021-01-15T00:00:

In [7]:
# User instruction for the AWS code-generation experiment.
message = "List all SageMaker Models in Sydney region in my AWS account."

# System prompt for the code-generating assistant.
# NOTE: every line ends with a literal "\n" escape AND a real line break inside
# the triple-quoted string, so the rendered prompt has a blank line between
# sentences.
system_message = """ You are a helpful AI assistant that generates code.\n
You have access to a Python code interpreter environment, which supports you in your tasks.\n
You respond by generating Python code to answer user instructions.\n
The code is executed in an interactive shell, imports and variables are preserved between calls.\n
The environment has internet, file system access and access to AWS my account.\n
To answer instructions about my AWS account, generate Python code.\n
When generating code, use the following format:\n```python\n<your code>\n```\n\n
Report expected output and enclose it within a <output></output> tag.\n
If you want to call Python and still say something, do only output text above the code block, NOT below.\n
Only provide at most one code block per message.\n
The code will be executed automatically."""

# Opening and closing instruction markers wrapped around the user turn.
Roles = ["<s>[INST]", "[/INST]"]

# Single-turn prompt: system block inside <<SYS>> markers, then the user message.
# This rebinds `prompt`, discarding whatever earlier cells accumulated in it.
prompt = f"{Roles[0]} <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} {Roles[1]}"

In [6]:
# Append another user turn to the running prompt.
# NOTE: `+=` mutates `prompt` in place, so re-running this cell appends the
# same instruction again.
prompt += "<s>[INST]List all SageMaker Models in Sydney region in my AWS account.[/INST]  "

In [46]:
prompt

'<s>[INST] <<SYS>>\n You are a helpful AI assistant that generates code in a single block from user instructions.\nYou have access to a Python code interpreter, which supports you in your tasks.\nThe code is executed in an interactive shell, imports and variables are preserved between calls.\nThe environment has internet and file system access.\nThe current working directory is shared with the user, so files can be exchanged.\nThere are many libraries pre-installed, including numpy, pandas, matplotlib, and scikit-learn.\nYou also have access to AWS my account, use code to query my account if needed.\nYou cannot show rich outputs like plots or images, but you can store them in the working directory and point the user to them.\nIf the code runs too long, there will be a timeout.\n\nTo access the interpreter, use the following format:\n```python\n<your code>\n```\n\nReport expected output and enclose it within a <output></output> tag.\nIf you want to call Python and still say something, d

In [30]:
%%bash -s $host_ip
# ${1} is the notebook variable host_ip, passed in through `-s` on the magic line.
# NOTE(review): "priate" in the system prompt looks like a typo for "pirate"; it is part of the request payload, so it is left untouched here.
curl -X POST ${1}:8001/invoke_stream -d '{"body":{"prompt": "<s>[INST] <<SYS>>\nYou are a priate\n<</SYS>>\n\nWrite a short poem about Sydney in Australia [/INST]", "stop": ["</s>"]}, "model_family": "sagemaker", "model_name": "tgi.Llama-2-7b-chat-hf-endpoint"}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


{"text": " ", "logprobs": -2.026558e-06, "generated_text": " ", "id": 29871, "special": false}
{"text": " A", "logprobs": -0.94970703, "generated_text": " A", "id": 319, "special": false}
{"text": "ho", "logprobs": -0.016082764, "generated_text": "ho", "id": 1251, "special": false}
{"text": "y", "logprobs": -1.7881393e-06, "generated_text": "y", "id": 29891, "special": false}
{"text": " mate", "logprobs": -0.082336426, "generated_text": " mate", "id": 15358, "special": false}
{"text": "y", "logprobs": -0.000323534, "generated_text": "y", "id": 29891, "special": false}
{"text": "!", "logprobs": -0.0021686554, "generated_text": "!", "id": 29991, "special": false}
{"text": " Here", "logprobs": -0.33862305, "generated_text": " Here", "id": 2266, "special": false}
{"text": " be", "logprobs": -0.041137695, "generated_text": " be", "id": 367, "special": false}
{"text": " me", "logprobs": -0.047454834, "generated_text": " me", "id": 592, "special": false}
{"text": " poem", "logprobs": -0.10723

100 20683    0 20463  100   220   2196     23  0:00:09  0:00:09 --:--:--  2232


{"text": "'", "logprobs": -9.9658966e-05, "generated_text": "'", "id": 29915, "special": false}
{"text": "ll", "logprobs": -1.1444092e-05, "generated_text": "ll", "id": 645, "special": false}
{"text": " stay", "logprobs": -0.45654297, "generated_text": " stay", "id": 7952, "special": false}
{"text": ",", "logprobs": -0.0010299683, "generated_text": ",", "id": 29892, "special": false}
{"text": " until", "logprobs": -0.3647461, "generated_text": " until", "id": 2745, "special": false}
{"text": " the", "logprobs": -0.9194336, "generated_text": " the", "id": 278, "special": false}
{"text": " day", "logprobs": -0.2944336, "generated_text": " day", "id": 2462, "special": false}
{"text": " be", "logprobs": -0.22546387, "generated_text": " be", "id": 367, "special": false}
{"text": " gone", "logprobs": -0.5048828, "generated_text": " gone", "id": 7695, "special": false}
{"text": ".", "logprobs": -0.0030517578, "generated_text": ".", "id": 29889, "special": false}
{"text": "\n", "logprobs": -0.

In [21]:
from dataclasses import dataclass
from typing import Tuple, Optional

import requests
import json

# Create ChatGPT Custom ModelRunner
@dataclass
class ChatGPTModelConfig:
    """Settings used by ChatGPTModelRunner for every request."""
    temperature: float  # forwarded as the "temperature" request field
    top_p: float  # forwarded as the "top_p" request field
    max_tokens: int  # forwarded as the "max_tokens" request field
    api_key: str  # sent verbatim as the Authorization header (include the "Bearer " prefix)
    model: str = "gpt-3.5-turbo"  # forwarded as the "model" request field
    timeout: float = 60.0  # seconds to wait for the HTTP call before giving up


class ChatGPTModelRunner:#(ModelRunner):
    """Minimal client that sends one user prompt to the chat completions endpoint."""

    url = "https://api.openai.com/v1/chat/completions"

    def __init__(self, model_config: ChatGPTModelConfig):
        """Store the configuration used by every ``predict`` call."""
        self.config = model_config

    def _build_payload(self, prompt: str) -> dict:
        """Return the request body for a single-message, non-streaming completion."""
        return {
            # getattr keeps config objects created before `model` existed working.
            "model": getattr(self.config, "model", "gpt-3.5-turbo"),
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": self.config.temperature,
            "top_p": self.config.top_p,
            "n": 1,
            "stream": False,
            "max_tokens": self.config.max_tokens,
            "presence_penalty": 0,
            "frequency_penalty": 0
        }

    def _build_headers(self) -> dict:
        """Return the HTTP headers, including the Authorization value from the config."""
        return {
            'Content-Type': 'application/json',
            'Accept': 'application/json',
            'Authorization': self.config.api_key
        }

    def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text placed in the only ``user`` message of the request.

        Returns:
            ``(content, None)`` where ``content`` is the message content of the
            first choice in the response; the second element is always ``None``.

        Raises:
            requests.HTTPError: If the service answers with an error status.
            requests.Timeout: If no answer arrives within the configured timeout.
        """
        response = requests.request(
            "POST",
            self.url,
            headers=self._build_headers(),
            data=json.dumps(self._build_payload(prompt)),
            # Without a timeout a stalled connection would hang the cell forever.
            timeout=getattr(self.config, "timeout", 60.0),
        )
        # Fail with the HTTP error itself instead of a confusing KeyError on
        # "choices" when the service returns an error body.
        response.raise_for_status()

        return json.loads(response.text)["choices"][0]["message"]["content"], None
    


In [22]:
import os

# The API key is read from the OPENAI_API_KEY environment variable instead of
# being committed to the notebook; a missing variable raises KeyError here.
# The config value is sent verbatim as the Authorization header, hence the
# "Bearer " prefix.
# SECURITY: a real key was previously hardcoded in this cell - it must be
# treated as leaked and revoked.
config = ChatGPTModelConfig(
    api_key=f"Bearer {os.environ['OPENAI_API_KEY']}",
    temperature=1.0,
    top_p=1.0,
    max_tokens=250
)
model_runner = ChatGPTModelRunner(config)
print(model_runner.predict("London is the capital of?"))

('London is the capital of England and the United Kingdom.', None)


In [39]:
# Load the model catalogue shipped with the API layer and display it.
# NOTE(review): the path is relative to the current working directory, and an
# earlier cell calls os.chdir("app"); after that cell has run, this path would
# resolve to app/app/... - confirm the intended execution order.
all_models_file_path = "app/handlers/schemas/all-models.json"
with open(all_models_file_path, "r") as all_models_file:
    all_models = json.load(all_models_file)
# Bare expression so the notebook renders the parsed list.
all_models

[{'model_type': 'Claude',
  'model_name': 'anthropic.claude-instant-v1',
  'model_family': 'bedrock'},
 {'model_type': 'Claude',
  'model_name': 'anthropic.claude-v1',
  'model_family': 'bedrock'},
 {'model_type': 'Claude',
  'model_name': 'anthropic.claude-v2',
  'model_family': 'bedrock'},
 {'model_type': 'Titan',
  'model_name': 'amazon.titan-tg1-large',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-grande-instruct',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-grande-instruct',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-jumbo-instruct',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-mid',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-mid-v1',
  'model_family': 'bedrock'},
 {'model_type': 'Jurassic',
  'model_name': 'ai21.j2-ultra-v1',
  'model_family': 'bedrock'},
 {'model_type': 'Cohere',
  'mode

In [17]:
!aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 871623330196.dkr.ecr.us-west-2.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [18]:
!docker tag api-layer:latest 871623330196.dkr.ecr.us-west-2.amazonaws.com/api-layer:latest

In [19]:
!docker push 871623330196.dkr.ecr.us-west-2.amazonaws.com/api-layer:latest

The push refers to repository [871623330196.dkr.ecr.us-west-2.amazonaws.com/api-layer]

[1Be518f05d: Preparing 
[1B4a655e98: Preparing 
[1B906eb2b6: Preparing 
[1B41e9898f: Preparing 
[1B4e8fde00: Preparing 
[1B148fcd6a: Preparing 
[1B0d2f909b: Preparing 
[1B8bd86d5f: Preparing 
[1B9ac672ec: Preparing 
[1B3b166360: Pushed   77.87MB/74.82MB[9A[2K[9A[2K[7A[2K[9A[2K[5A[2K[5A[2K[3A[2K[5A[2K[3A[2K[5A[2K[3A[2K[5A[2K[4A[2K[5A[2K[3A[2K[9A[2K[3A[2K[5A[2K[3A[2K[5A[2K[2K[3A[2K[9A[2K[2A[2K[9A[2K[2A[2K[9A[2K[3A[2K[9A[2K[3A[2K[9A[2K[3A[2K[9A[2K[5A[2K[9A[2K[3A[2K[1A[2K[3A[2K[6A[2K[9A[2K[1A[2K[2A[2K[1A[2K[1A[2K[1A[2K[1A[2K[9A[2K[1A[2K[9A[2K[9A[2K[1A[2K[1A[2K[3A[2K[1A[2K[1A[2K[1A[2KPushing   29.8MB/74.82MB[1A[2K[1A[2K[9A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2Klates

In [None]:
# `import concurrent` alone does not load the `futures` submodule; it only
# works when something else in the process has already imported it. Import the
# submodule explicitly so the cell also runs on a fresh kernel.
import concurrent.futures
import requests

def request_post(url, data):
    """POST ``data`` to ``url`` and return the resulting ``requests`` response."""
    return requests.post(url, data=data)

# NOTE(review): `url` and `names` are not defined in the visible cells - they
# must be set before running this cell.
with concurrent.futures.ThreadPoolExecutor() as executor: # optimally defined number of threads
    # One POST per entry of `names`, all submitted to the pool up front.
    res = [executor.submit(request_post, url, data) for data in names]
    # Block until every request has finished (successfully or with an error).
    concurrent.futures.wait(res)