In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from benchmarking.payload import create_test_payload


# Named request payloads used by the benchmark. Keys follow the pattern
# "input_<words>_output_<tokens>"; the key is the label shown in the "payload"
# column of the results table, so it must agree with the arguments passed to
# create_test_payload.
PAYLOADS = {
    # Example of a hand-written payload (disabled):
    # "sample_input": {
    #     "inputs":"The new movie that got Oscar this year",
    #     "parameters":{"max_new_tokens":256, "do_sample":True}
    # }
    # NOTE: output_tokens was previously 32 here, contradicting the
    # "output_128" in the key; aligned with the key's naming.
    "input_128_output_128": create_test_payload(input_words=128, output_tokens=128),
    "input_512_output_256": create_test_payload(input_words=512, output_tokens=256),
}

# Benchmark targets: the same Falcon-7B configuration deployed on two
# different instance types. Every setting except the instance type is shared,
# so the entries are generated from (model name, instance type) pairs.
MODELS = {
    model_name: {
        "model_specs": {
            "image_uri_args": {"framework": "huggingface-llm", "version": "1.1.0"},
            "model_args": {
                "env": {
                    "HF_MODEL_ID": "tiiuae/falcon-7b",
                    "SM_NUM_GPUS": "1",
                    "MAX_CONCURRENT_REQUESTS": "512",
                    "MAX_BATCH_PREFILL_TOKENS": "16384",
                },
            },
            "deploy_args": {
                "initial_instance_count": 1,
                "instance_type": instance_type,
                "container_startup_health_check_timeout": 1200,
            },
        },
        "huggingface_model_id": "tiiuae/falcon-7b",
    }
    for model_name, instance_type in (
        ("falcon-7b-g5-2xlarge", "ml.g5.2xlarge"),
        ("falcon-7b-p4d-24xlarge", "ml.p4d.24xlarge"),
    )
}

In [5]:
from functools import partial
from pathlib import Path

from benchmarking.runner import Benchmarker
from benchmarking.concurrency_probe import num_invocation_scaler

# File that the benchmark metrics are saved to; the same path is given both to
# the Benchmarker constructor and to the run call below.
save_file_path = Path("./metrics_falcon_instances.json")

benchmarker = Benchmarker(
    payloads=PAYLOADS,
    saved_metrics_path=save_file_path,
    # Enable the concurrency probe and hand it num_invocation_scaler with
    # num_invocation_factor fixed at 5.
    # NOTE(review): presumably this scales the number of invocations per
    # concurrency level -- confirm in benchmarking.concurrency_probe.
    run_concurrency_probe=True,
    concurrency_probe_num_invocation_hook=partial(
        num_invocation_scaler,
        num_invocation_factor=5,
    ),
)

# Benchmark every model defined in MODELS; the returned metrics are kept for
# inspection in later cells.
metrics = benchmarker.run_multiple_models(
    models=MODELS,
    save_file_path=save_file_path,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml
2023-10-18 21:32:45,602 | INFO : (Model 'falcon-7b-g5-2xlarge'): Deploying endpoint bm-falcon-7b-g5-2xlarge-2023-10-18-21-32-45-602 ...
2023-10-18 21:32:45,603 | INFO : (Model 'falcon-7b-p4d-24xlarge'): Deploying endpoint bm-falcon-7b-p4d-24xlarge-2023-10-18-21-32-45-603 ...
2023-10-18 21:32:45,605 | INFO : Defaulting to only available Python version: py39
2023-10-18 21:32:45,605 | INFO : Defaulting to only available Python version: py39
2023-10-18 21:32:45,719 | INFO : Defaulting to only supported image scope: gpu.
2023-10-18 21:32:45,721 | INFO : Defaulting to only supported image sc

In [5]:
from pathlib import Path

import pandas as pd
from benchmarking.runner import Benchmarker


# Read the saved benchmark metrics back into a DataFrame.
save_file_path = Path("./metrics_falcon_instances.json")
df = Benchmarker.load_metrics_pandas(save_file_path=save_file_path)

# Build the concurrency-probe pivot table. value_name_dict supplies the column
# headings and value_format_dict the cell formatters for each reported metric;
# the commented-out entries are additional metrics left disabled.
df_pivot = Benchmarker.create_concurrency_probe_pivot_table(
    df,
    value_name_dict={
        # "LatencyPerToken.p90": "p90 latency (ms/token)",
        # "TokenThroughput": "throughput (tokens/s)",
        "TimeToGenerate1MTokens": "time to generate 1M tokens (hr)",
        "CostToGenerate1MTokens": "cost to generate 1M tokens ($)",
    },
    value_format_dict={
        # "TokenThroughput": int,
        # "LatencyPerToken.p90": int,
        "TimeToGenerate1MTokens": "{:,.2f}".format,
        "CostToGenerate1MTokens": "${:,.2f}".format,
    },
)

# Adjust pandas' display limits before rendering the table. These are global
# options and stay in effect for the rest of the kernel session.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.max_columns", None)

display(df_pivot)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),time to generate 1M tokens (hr),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($),cost to generate 1M tokens ($)
Unnamed: 0_level_1,Unnamed: 1_level_1,concurrent requests,1,2,4,8,16,32,64,128,256,1,2,4,8,16,32,64,128,256
model ID,instance type,payload,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
falcon-7b-g5-2xlarge,ml.g5.2xlarge,input_512_output_256,8.74,4.55,2.37,1.25,0.74,0.45,0.33,--,--,$13.24,$6.89,$3.59,$1.90,$1.12,$0.67,$0.50,--,--
falcon-7b-p4d-24xlarge,ml.p4d.24xlarge,input_512_output_256,3.84,2.23,1.05,0.55,0.33,0.2,0.15,0.12,0.11,$144.60,$84.23,$39.61,$20.81,$12.31,$7.67,$5.51,$4.63,$4.03


In [9]:
# Build a Benchmarker from the saved metrics file (relies on PAYLOADS and
# save_file_path defined in earlier cells).
benchmarker = Benchmarker(
    payloads=PAYLOADS,
    saved_metrics_path=save_file_path,
)

# Show which endpoint belongs to each model ID before anything is deleted.
print(benchmarker.model_id_to_endpoint_name)

# Destructive: per the log output, this deletes the model, endpoint
# configuration and endpoint for every listed model.
benchmarker.clean_up_resources()

{'llama-2-13b-tgi-p4d-24xlarge': 'bm-llama-2-13b-tgi-p4d-24xlarge-2023-10-18-19-15-24-376', 'llama-2-13b-tgi': 'bm-llama-2-13b-tgi-2023-10-18-16-29-11-752', 'llama-2-13b-tgi-g5-48xlarge': 'bm-llama-2-13b-tgi-g5-48xlarge-2023-10-18-19-15-24-373'}
2023-10-18 21:27:35,601 | INFO : (Model 'llama-2-13b-tgi-p4d-24xlarge'): Cleaning up resources ...
2023-10-18 21:27:36,065 | INFO : Deleting model with name: bm-llama-2-13b-tgi-p4d-24xlarge-2023-10-18-19-15-24-376
2023-10-18 21:27:36,302 | INFO : Deleting endpoint configuration with name: bm-llama-2-13b-tgi-p4d-24xlarge-2023-10-18-19-15-24-376
2023-10-18 21:27:36,511 | INFO : Deleting endpoint with name: bm-llama-2-13b-tgi-p4d-24xlarge-2023-10-18-19-15-24-376
2023-10-18 21:27:36,647 | INFO : (Model 'llama-2-13b-tgi'): Cleaning up resources ...
2023-10-18 21:27:36,978 | INFO : Deleting model with name: bm-llama-2-13b-tgi-2023-10-18-16-29-11-752
2023-10-18 21:27:37,815 | INFO : Deleting endpoint configuration with name: bm-llama-2-13b-tgi-2023-10