### Load session and Hugging Face token

In [None]:
!pip install huggingface_hub --quiet
!git config --global credential.helper store

In [None]:
import boto3
import sagemaker
from botocore.config import Config

# One boto3 session shared by everything below, so the region used to build the
# image URIs and the SageMaker clients all come from the same configuration.
boto_session = boto3.session.Session()
region = boto_session.region_name

# Botocore client configuration that caps automatic retries at `max_attempts=1`.
no_retry_config = Config(retries={"max_attempts": 1})

# SageMaker session whose control-plane and runtime clients use the retry-limited
# configuration and are created from the shared boto3 session.
sm_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=boto_session.client("sagemaker", config=no_retry_config),
    sagemaker_runtime_client=boto_session.client("sagemaker-runtime", config=no_retry_config),
)

# Execution role that is handed to the models created later in the notebook.
role = sagemaker.get_execution_role()

In [None]:
from huggingface_hub import notebook_login

# Prompt for a Hugging Face access token; a later cell reads the saved token
# with `HfFolder.get_token()` and passes it to the containers as `HF_TOKEN`.
notebook_login()

### Serving properties for LMI v15 and LMI v14

The LMI v15 container is configured to serve in `async_mode`, while the LMI v14 container is configured to use `rolling_batch`.

In [None]:
from huggingface_hub import HfFolder
from sagemaker.djl_inference.model import DJLModel
import os

# Images for LMIv14 and LMIv15
image_v14 = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.32.0-lmi14.0.0-cu124"
image_v15 = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128-v1.0"

# Serving properties (environment variable values are passed as strings)
HF_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
MAX_MODEL_LEN = "1024"
BATCH_SIZE = "64"

# Read the token saved by the login cell once and fail early with a clear message:
# a missing token would otherwise only surface later, while the endpoint is being created.
hf_token = HfFolder.get_token()
if not hf_token:
    raise RuntimeError(
        "No Hugging Face token found. Run the `notebook_login()` cell before creating the models."
    )

# LMI v15: rolling batch disabled, async mode enabled with the vLLM async entrypoint.
lmi_v15_config = {
    "HF_MODEL_ID": HF_MODEL_ID,
    "HF_TOKEN": hf_token,
    "OPTION_TRUST_REMOTE_CODE": "true",
    "SERVING_ENGINE": "Python",
    "OPTION_MAX_ROLLING_BATCH_SIZE": BATCH_SIZE,
    "OPTION_MODEL_LOADING_TIMEOUT": "1800",
    "OPTION_MAX_MODEL_LEN": MAX_MODEL_LEN,
    "SERVING_FAIL_FAST": "true",
    "OPTION_ROLLING_BATCH": "disable",
    "OPTION_ASYNC_MODE": "true",
    "OPTION_ENTRYPOINT": "djl_python.lmi_vllm.vllm_async_service",
}

# `sm_session` is attached so that deployment and the resulting predictor use the
# retry-limited clients configured when the session was created.
model_v15 = DJLModel(
    env=lmi_v15_config,
    role=role,
    image_uri=image_v15,
    sagemaker_session=sm_session,
)

# LMI v14: vLLM rolling batch with prefix caching turned off.
lmi_v14_config = {
    "HF_MODEL_ID": HF_MODEL_ID,
    "HF_TOKEN": hf_token,
    "OPTION_TRUST_REMOTE_CODE": "true",
    "SERVING_ENGINE": "Python",
    "OPTION_TENSOR_PARALLEL_DEGREE": "1",
    "OPTION_MAX_ROLLING_BATCH_SIZE": BATCH_SIZE,
    "OPTION_ROLLING_BATCH": "vllm",
    "OPTION_MAX_ROLLING_BATCH_PREFILL_TOKENS": MAX_MODEL_LEN,
    "OPTION_MAX_MODEL_LEN": MAX_MODEL_LEN,
    "OPTION_ENABLE_PREFIX_CACHING": "False",
}

model_v14 = DJLModel(
    env=lmi_v14_config,
    role=role,
    image_uri=image_v14,
    sagemaker_session=sm_session,
)

### Deploy model with LMI v15 and LMI v14

In [None]:
INSTANCE_TYPE = "ml.g6e.2xlarge"

# Deploy the LMI v15 model to a one-instance endpoint whose name is generated
# from the "lmi-v15" base.
endpoint_name_v15 = sagemaker.utils.name_from_base("lmi-v15")
predictor_v15 = model_v15.deploy(
    endpoint_name=endpoint_name_v15,
    instance_type=INSTANCE_TYPE,
    initial_instance_count=1,
)

In [None]:
# Deploy the LMI v14 model to a one-instance endpoint of the same instance type,
# with a name generated from the "lmi-v14" base.
endpoint_name_v14 = sagemaker.utils.name_from_base("lmi-v14")
predictor_v14 = model_v14.deploy(
    endpoint_name=endpoint_name_v14,
    instance_type=INSTANCE_TYPE,
    initial_instance_count=1,
)

### Invoking the model using the OpenAI schema

The shape of the request depends on the version of the LMI container being used. The following cell downloads a Shakespeare sonnet text file (`sonnet.txt`) to use in the token benchmarking tests, an approach similar to `LLMPerf` but without having to install the library.

In [None]:
!curl -O https://raw.githubusercontent.com/ray-project/llmperf/refs/heads/main/src/llmperf/sonnet.txt

### Calculate metrics

In [None]:
import numpy as np 
import datetime
import time
import boto3   
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

from benchmarking_utils import inference_latency

In [None]:
from typing import Callable, Tuple, Any
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
from transformers import LlamaTokenizerFast

def run_benchmark(
    predictor: Callable,
    number_of_clients: int,
    number_of_runs: int,
    openai_chat_completions: bool
) -> Tuple[float, float]:
    """
    Run a benchmark to measure token throughput and median latency of a predictor.

    Requests are issued through `inference_latency` from a thread pool of
    `number_of_clients` workers until `number_of_runs` requests have completed.

    Args:
        predictor (Callable): The function or model used to generate predictions.
        number_of_clients (int): The number of parallel clients to simulate.
        number_of_runs (int): The total number of benchmark runs to execute.
        openai_chat_completions (bool): Flag indicating if OpenAI-style chat completions
                                        are being used (True) or if we are following the
                                        Hugging Face schema (False).

    Returns:
        Tuple[float, float]: A tuple containing:
            - token_throughput (float): Total tokens generated divided by the wall-clock
              time of the request phase (tokens/sec).
            - p50_latency_ms (float): The 50th percentile (median) of the `latency` values
              returned by `inference_latency` (assumed to be milliseconds — confirm
              against `benchmarking_utils`).
    """
    progress_bar = tqdm(range(number_of_runs), position=0, leave=True)

    # Time only the request phase. Reading the progress bar's elapsed time later would
    # also include the tokenizer loading/encoding done below for the Hugging Face schema,
    # which would understate that branch's throughput.
    started_at = time.perf_counter()
    results = Parallel(n_jobs=number_of_clients, prefer="threads")(
        delayed(inference_latency)(predictor, openai_chat_completions)
        for _ in progress_bar
    )
    elapsed_time = (time.perf_counter() - started_at) or 1e-6  # Avoid division by zero

    latencies = [res['latency'] for res in results]
    p50_latency_ms = float(np.quantile(latencies, 0.50))

    if openai_chat_completions:
        # The OpenAI-style response reports the generated token count directly.
        tokens = [res['result']['usage']['completion_tokens'] for res in results]
    else:
        # The Hugging Face-style response only carries text, so tokens are counted locally.
        # NOTE(review): this tokenizer is not loaded from the served model id, so counts may
        # not match the server-reported `completion_tokens` used above — confirm this is acceptable.
        tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
        tokens = [
            # Special tokens added by the tokenizer are not generated output; leave them out.
            len(tokenizer.encode(res['result']['generated_text'], add_special_tokens=False))
            for res in results
        ]

    token_throughput = sum(tokens) / elapsed_time

    return token_throughput, p50_latency_ms

In [None]:
from typing import Callable, List, Optional

import pandas as pd
from pandas import DataFrame

def store_metrics(
    predictor: Callable,
    num_requests: int = 512,
    num_clients_list: Optional[List[int]] = None,
    openai_chat_completions: bool = True
) -> DataFrame:
    """
    Run benchmark tests with varying levels of concurrency and collect performance metrics.

    Args:
        predictor (Callable): The function or model used to generate predictions.
        num_requests (int, optional): Total number of requests to simulate for each concurrency level. Default is 512.
        num_clients_list (List[int], optional): Client counts (concurrency levels) to test.
                                                 Defaults to [16, 32, 64] when omitted or None.
        openai_chat_completions (bool, optional): Flag indicating whether the model returns OpenAI-style completions
                                                  with token usage stats (True) or requires manual tokenization (False).

    Returns:
        pd.DataFrame: One row per concurrency level, indexed by `num_clients`, with the columns
        `p50_latency_ms` and `token_per_s` as returned by `run_benchmark`.
    """
    # Resolve the default inside the call instead of sharing one list object across calls.
    if num_clients_list is None:
        num_clients_list = [16, 32, 64]
    concurrency_levels = list(num_clients_list)

    records = []
    for num_clients in concurrency_levels:
        token_throughput, p50_latency = run_benchmark(
            predictor,
            number_of_clients=num_clients,
            number_of_runs=num_requests,
            openai_chat_completions=openai_chat_completions
        )
        records.append({
            "p50_latency_ms": p50_latency,
            "token_per_s": token_throughput,
        })

    # Columns are named explicitly so the frame keeps its schema even with no levels.
    results_df = pd.DataFrame(
        records,
        columns=["p50_latency_ms", "token_per_s"],
        index=pd.Index(concurrency_levels, name="num_clients"),
    )

    return results_df

### Run benchmarking tests and plot results

In [None]:
# Define parameters for benchmarking test
num_clients = [16, 32, 64]
num_requests = 512

# Run tests for LMIv15 (OpenAI chat-completions schema) and LMIv14 (Hugging Face schema).
# `store_metrics` already indexes its result by concurrency level under the name
# "num_clients", so the index is not reassigned here (doing so would drop that name).
df_v15 = store_metrics(predictor_v15, num_requests, num_clients, openai_chat_completions=True)
df_v14 = store_metrics(predictor_v14, num_requests, num_clients, openai_chat_completions=False)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Each container version is drawn on both panels with the same colour and label.
series = [
    (df_v14, "steelblue", "LMIv14"),
    (df_v15, "firebrick", "LMIv15"),
]

# (axis, dataframe column, panel title, y-axis label)
panels = [
    (axes[0], "p50_latency_ms", "p50 Latency", "Latency (ms)"),
    (axes[1], "token_per_s", "Throughput", "Tokens per second"),
]

for ax, column, title, y_label in panels:
    for frame, color, label in series:
        ax.plot(frame.index, frame[column], marker="o", linestyle="--", color=color, label=label)
    ax.set_title(title)
    ax.set_xlabel("Concurrent Clients")
    ax.set_ylabel(y_label)
    ax.grid(True)
    ax.legend()

# Save the comparison figure next to the notebook, then render it inline.
plt.tight_layout()
plt.savefig("benchmark_g6e_2xlarge_compared.png")
plt.show()

In [None]:
# Clean up every resource created by the deployments, not just the endpoints.
# `delete_model()` is called before `delete_endpoint()` for each predictor, while the
# endpoint that references the model still exists.
for predictor in (predictor_v14, predictor_v15):
    predictor.delete_model()
    predictor.delete_endpoint()