In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to imported source files take effect without
# restarting the kernel. Keep comments on their own lines: text after a line
# magic is passed to the magic as arguments.
%load_ext autoreload
%autoreload 2

In [None]:
from benchmarking.payload import create_test_payload


# Test payloads keyed by a label that spells out the requested sizes.
# Each spec row is (label, input word count, output token count); the
# comprehension keeps the rows' order, so the dict is ordered as listed.
PAYLOADS = {
    label: create_test_payload(input_words=word_count, output_tokens=token_count)
    for label, word_count, token_count in (
        ("input_128_output_128", 128, 128),
        ("input_512_output_128", 512, 128),
    )
}

def _container_model_entry(framework, version, env, huggingface_model_id):
    """Build one MODELS entry for a model served from an explicitly chosen image.

    Args:
        framework: Value stored under ``image_uri_args["framework"]``.
        version: Value stored under ``image_uri_args["version"]``.
        env: Mapping stored as-is under ``model_args["env"]``.
        huggingface_model_id: Value stored under ``"huggingface_model_id"``.

    Returns:
        A new nested dict with ``"model_specs"`` and ``"huggingface_model_id"``
        keys. The ``deploy_args`` dict is created afresh on every call, so
        entries never share mutable state.
    """
    return {
        "model_specs": {
            "image_uri_args": {"framework": framework, "version": version},
            "model_args": {"env": env},
            "deploy_args": {
                "initial_instance_count": 1,
                "instance_type": "ml.g5.2xlarge",
                "container_startup_health_check_timeout": 600,
            },
        },
        "huggingface_model_id": huggingface_model_id,
    }


# Models to benchmark, keyed by a display name. An entry carries either
# "jumpstart_model_specs" or "model_specs", plus the "huggingface_model_id".
MODELS = {
    "falcon-7b-jumpstart": {
        "jumpstart_model_specs": {
            "model_args": {"model_id": "huggingface-llm-falcon-7b-bf16"},
        },
        "huggingface_model_id": "tiiuae/falcon-7b",
    },
    # Disabled variant of the JumpStart entry with two initial instances;
    # uncomment to include it in the run.
    # "falcon-7b-jumpstart-2": {
    #     "jumpstart_model_specs": {
    #         "model_args": {"model_id": "huggingface-llm-falcon-7b-bf16"},
    #         "deploy_args": {"initial_instance_count": 2},
    #     },
    #     "huggingface_model_id": "tiiuae/falcon-7b",
    # },
    "falcon-7b-lmi": _container_model_entry(
        framework="djl-deepspeed",
        version="0.23.0",
        env={
            "SERVING_LOAD_MODELS": "test::MPI=/opt/ml/model",
            "OPTION_MODEL_ID": "tiiuae/falcon-7b",
            "OPTION_TASK": "text-generation",
            "OPTION_TRUST_REMOTE_CODE": "true",
            "OPTION_TENSOR_PARALLEL_DEGREE": "1",
            "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
            "OPTION_ROLLING_BATCH": "lmi-dist",
            "OPTION_MAX_ROLLING_BATCH_PREFILL_TOKENS": "1560",
            "OPTION_DTYPE": "fp16",
        },
        huggingface_model_id="tiiuae/falcon-7b",
    ),
    "falcon-7b-tgi": _container_model_entry(
        framework="huggingface-llm",
        version="1.1.0",
        env={
            "HF_MODEL_ID": "tiiuae/falcon-7b",
            "SM_NUM_GPUS": "1",
        },
        huggingface_model_id="tiiuae/falcon-7b",
    ),
}

In [None]:
from benchmarking.runner import Benchmarker


# Build a benchmarker over the PAYLOADS defined earlier, with the concurrency
# probe switched on, then run it against every entry in MODELS.
# NOTE(review): presumably this deploys one endpoint per model and leaves it
# running until clean_up_resources() is called — confirm in benchmarking.runner.
benchmarker = Benchmarker(payloads=PAYLOADS, run_concurrency_probe=True)
metrics = benchmarker.run_multiple_models(models=MODELS)

In [None]:
import pandas as pd
from benchmarking.runner import Benchmarker


# Load the benchmark metrics as a DataFrame and pivot the concurrency-probe
# results into a summary table.
# NOTE(review): presumably load_metrics_pandas() reads metrics persisted by
# benchmark runs rather than the in-memory `metrics` value — confirm.
df = Benchmarker.load_metrics_pandas()
df_pivot = Benchmarker.create_concurrency_probe_pivot_table(df)

# Render the table without truncation. `None` is the documented "no limit"
# value for both max_columns and max_colwidth; a max_colwidth of 0 only avoids
# truncation through an implementation detail.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 500)
display(df_pivot)

In [None]:
# Release whatever the benchmarker created during the run.
# NOTE(review): presumably this deletes the deployed endpoints/models — confirm
# in benchmarking.runner. If an earlier cell fails, run this cell manually so
# nothing is left behind.
benchmarker.clean_up_resources()