In [None]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# automatically before each cell runs, so edits to imported code (e.g. the
# `benchmarking` package used below) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from benchmarking.payload import create_test_payload


# Named request payloads to benchmark. The single entry is built by
# `create_test_payload` with input_words=128 and output_tokens=128; the key
# mirrors those sizes.
PAYLOADS = {
    "input_128_output_128": create_test_payload(input_words=128, output_tokens=128),
}

# Models to benchmark, keyed by a display name. Each entry carries the
# JumpStart model arguments and the corresponding Hugging Face model id.
# NOTE(review): how the benchmarking package consumes these keys is not
# visible in this notebook -- confirm against `benchmarking.runner`.
MODELS = {
    "falcon-7b-jumpstart": {
        "jumpstart_model_specs": {"model_args": {"model_id": "huggingface-llm-falcon-7b-bf16"}},
        "huggingface_model_id": "tiiuae/falcon-7b",
    },
}

In [None]:
from benchmarking.concurrency_probe import ConcurrentProbeIteratorBase


class ConcurrentProbeExponentialScalingIterator(ConcurrentProbeIteratorBase):
    """Concurrency probe iterator that yields 25, 50, 75, ... concurrent requests.

    Despite "Exponential" in the class name, each step adds a fixed increment
    (linear growth). Iteration stops when ``self.exception`` is set or when the
    p90 latency of the most recent result exceeds ``max_latency_seconds``; the
    cause is recorded in ``self.stop_reason`` before ``StopIteration`` is raised.

    ``exception`` and ``result`` are only read here, never assigned; presumably
    the base class or the probe driver updates them between steps (TODO confirm).
    """

    def __iter__(self):
        """Reset the probe to its starting configuration and return ``self``."""
        self.max_latency_seconds = 25.0
        self.increment_value = 25
        self.concurrent_requests = 25
        return self

    def __next__(self) -> int:
        """Return the next concurrency level to probe.

        Returns:
            The starting level while no result has been recorded yet, otherwise
            the previous level plus ``increment_value``.

        Raises:
            StopIteration: if an exception was recorded, or if the last p90
                latency is above ``max_latency_seconds``.
        """
        # A recorded failure ends the probe immediately.
        if self.exception is not None:
            self.stop_reason = f"Error occured: {self.exception}"
            raise StopIteration

        # No result yet: hand out the current (initial) level unchanged.
        if self.result is None:
            return self.concurrent_requests

        # Dividing by 1e3 presumably converts milliseconds to seconds -- TODO confirm units.
        p90_seconds = self.result["Latency"]["p90"] / 1e3
        if p90_seconds > self.max_latency_seconds:
            self.stop_reason = f"Last p90 latency = {p90_seconds} > {self.max_latency_seconds}."
            raise StopIteration

        self.concurrent_requests += self.increment_value
        return self.concurrent_requests


def num_invocation_scaler_with_minimum(concurrent_requests: int) -> int:
    """Return how many invocations to issue for a given concurrency level.

    The count scales at three invocations per concurrent request, with a floor
    of 200 so that low concurrency levels still collect enough samples.

    Args:
        concurrent_requests: Number of requests issued concurrently.

    Returns:
        ``concurrent_requests * 3``, but never less than 200.
    """
    # `max` enforces the minimum promised by the function name. The previous
    # `min` capped the count at 200, which yields fewer invocations than
    # concurrent requests once concurrency exceeds 200.
    return max(concurrent_requests * 3, 200)

def num_invocation_time_estimate(concurrent_requests: int) -> int:
    """Estimate the invocation count needed to fill a fixed time budget.

    Uses a 300-second total budget and an assumed 10 seconds per request,
    i.e. 30 rounds of requests per concurrent client.

    Args:
        concurrent_requests: Number of requests issued concurrently.

    Returns:
        The number of rounds times ``concurrent_requests``, truncated to ``int``.
    """
    budget_seconds = 300
    seconds_per_request = 10
    rounds_in_budget = budget_seconds / seconds_per_request
    return int(rounds_in_budget * concurrent_requests)


In [None]:
from benchmarking.runner import Benchmarker


# Configure the benchmark over PAYLOADS with the concurrency probe enabled.
# The probe takes its concurrency levels from the custom iterator class and
# its per-level invocation count from `num_invocation_scaler_with_minimum`.
benchmarker = Benchmarker(
    payloads=PAYLOADS,
    run_concurrency_probe=True,
    concurrency_probe_concurrent_request_iterator_cls=ConcurrentProbeExponentialScalingIterator,
    concurrency_probe_num_invocation_hook=num_invocation_scaler_with_minimum,
)
# Run the benchmark for every entry in MODELS and keep the returned metrics.
# NOTE(review): presumably long-running (deploys/invokes endpoints) -- confirm
# against `Benchmarker.run_multiple_models`.
metrics = benchmarker.run_multiple_models(models=MODELS)

In [None]:
import pandas as pd
from benchmarking.runner import Benchmarker


# Load the benchmark metrics as a DataFrame and pivot them into the
# concurrency-probe summary table.
# NOTE(review): `load_metrics_pandas` takes no arguments here; presumably it
# reads metrics persisted by an earlier benchmark run -- confirm the source.
df = Benchmarker.load_metrics_pandas()
df_pivot = Benchmarker.create_concurrency_probe_pivot_table(df)

# Relax pandas display limits (set globally for the kernel) so the pivot table
# renders in full: all columns and up to 500 rows.
# NOTE(review): max_colwidth=0 looks intended to disable cell truncation;
# pandas documents `None` as the "unlimited" value -- confirm 0 behaves as wanted.
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", 0)
pd.set_option("display.max_rows", 500)
display(df_pivot)