# Load tests with K6


In [1]:
import os
from dotenv import load_dotenv

%load_ext autoreload
%autoreload 2

load_dotenv(override=True)

True

## 0. Lookup compatible configs


First, get the Inference Endpoints (IE) compute instance options


In [2]:
from autobench.compute_manager import ComputeManager

In [3]:
compute_manager = ComputeManager()
compute_manager.options.head()

Unnamed: 0,vendor,vendor_status,region,region_label,region_status,id,accelerator,num_gpus,memory_in_gb,gpu_memory_in_gb,instance_type,instance_size,architecture,status,price_per_hour,num_cpus
0,aws,available,us-east-1,N. Virginia,available,aws-us-east-1-nvidia-t4-x1,gpu,1,15,16,nvidia-t4,x1,Nvidia T4,available,0.5,3
1,aws,available,us-east-1,N. Virginia,available,aws-us-east-1-nvidia-t4-x4,gpu,4,192,64,nvidia-t4,x4,Nvidia T4,available,3.0,46
2,aws,available,us-east-1,N. Virginia,available,aws-us-east-1-nvidia-a10g-x1,gpu,1,30,24,nvidia-a10g,x1,Nvidia A10G,available,1.0,6
3,aws,available,us-east-1,N. Virginia,available,aws-us-east-1-nvidia-a10g-x4,gpu,4,186,96,nvidia-a10g,x4,Nvidia A10G,available,5.0,46
4,aws,available,us-east-1,N. Virginia,available,aws-us-east-1-nvidia-a100-x1,gpu,1,145,80,nvidia-a100,x1,Nvidia A100,available,4.0,11


In [4]:
compute_manager.options.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   vendor            21 non-null     object 
 1   vendor_status     21 non-null     object 
 2   region            21 non-null     object 
 3   region_label      21 non-null     object 
 4   region_status     21 non-null     object 
 5   id                21 non-null     object 
 6   accelerator       21 non-null     object 
 7   num_gpus          21 non-null     int64  
 8   memory_in_gb      21 non-null     int64  
 9   gpu_memory_in_gb  21 non-null     int64  
 10  instance_type     21 non-null     object 
 11  instance_size     21 non-null     object 
 12  architecture      21 non-null     object 
 13  status            21 non-null     object 
 14  price_per_hour    21 non-null     float64
 15  num_cpus          21 non-null     int64  
dtypes: float64(1), int64(4), object(11)
memory usa

User specifies their desired inputs


In [5]:
VENDOR = "aws"
REGION = "us-east-1"
GPU_TYPES = ["nvidia-a10g", "nvidia-l4"]

In [6]:
possible_instances = compute_manager.get_instance_details(
    vendor=VENDOR, region=REGION, gpu_types=GPU_TYPES
)

In [7]:
len(possible_instances)

4

Then, check if model will work on any of the desired instances, and if so, get each TGI config


In [8]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

viable_instances = compute_manager.get_viable_instance_configs(
    model_id=model_id, instances=possible_instances
)

In [9]:
viable_instances

[{'tgi_config': TGIConfig(model_id='meta-llama/Meta-Llama-3-8B-Instruct', max_batch_prefill_tokens=8192, max_input_length=6000, max_total_tokens=6144, num_shard=1, quantize=None, estimated_memory_in_gigabytes=23.01),
  'instance_config': ComputeInstanceConfig(id='aws-us-east-1-nvidia-a10g-x1', vendor='aws', vendor_status='available', region='us-east-1', region_label='N. Virginia', region_status='available', accelerator='gpu', num_gpus=1, memory_in_gb=30, gpu_memory_in_gb=24, instance_type='nvidia-a10g', instance_size='x1', architecture='Nvidia A10G', status='available', price_per_hour=1.0, num_cpus=6)},
 {'tgi_config': TGIConfig(model_id='meta-llama/Meta-Llama-3-8B-Instruct', max_batch_prefill_tokens=32768, max_input_length=6000, max_total_tokens=6144, num_shard=4, quantize=None, estimated_memory_in_gigabytes=98.36),
  'instance_config': ComputeInstanceConfig(id='aws-us-east-1-nvidia-a10g-x4', vendor='aws', vendor_status='available', region='us-east-1', region_label='N. Virginia', regi

## 1. Deploy LLM with TGI on Inference Endpoints


In [10]:
from autobench.deployment import Deployment
from autobench.config import DeploymentConfig

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
viable_instance = viable_instances[0]
viable_instance

{'tgi_config': TGIConfig(model_id='meta-llama/Meta-Llama-3-8B-Instruct', max_batch_prefill_tokens=8192, max_input_length=6000, max_total_tokens=6144, num_shard=1, quantize=None, estimated_memory_in_gigabytes=23.01),
 'instance_config': ComputeInstanceConfig(id='aws-us-east-1-nvidia-a10g-x1', vendor='aws', vendor_status='available', region='us-east-1', region_label='N. Virginia', region_status='available', accelerator='gpu', num_gpus=1, memory_in_gb=30, gpu_memory_in_gb=24, instance_type='nvidia-a10g', instance_size='x1', architecture='Nvidia A10G', status='available', price_per_hour=1.0, num_cpus=6)}

In [12]:
viable_instance["tgi_config"].env_vars

{'MAX_BATCH_PREFILL_TOKENS': '8192',
 'MAX_INPUT_LENGTH': '6000',
 'MAX_TOTAL_TOKENS': '6144',
 'NUM_SHARD': '1',
 'MODEL_ID': '/repository'}

In [13]:
deployment_config = DeploymentConfig(
    tgi_config=viable_instance["tgi_config"],
    instance_config=viable_instance["instance_config"],
)

In [14]:
# Attach to an endpoint that already exists, looked up by name, instead of
# constructing `Deployment(deployment_config)` without `existing_endpoint_name`.
# NOTE(review): the endpoint name below is presumably specific to the author's
# account -- replace it, or switch to the commented-out call, when running elsewhere.
# deployment = Deployment(deployment_config)

deployment = Deployment(
    deployment_config, existing_endpoint_name="d30f3a82-95b9-44e8-b0ca-617d17cc"
)

Getting existing endpoint d30f3a82-95b9-44e8-b0ca-617d17cc
Endpoint found.
Endpoint status: running


In [15]:
deployment.deployment_id, deployment.endpoint.status, deployment.endpoint.url

('d30f3a82-95b9-44e8-b0ca-617d17cc',
 'running',
 'https://ehdndzfif3v55m3w.us-east-1.aws.endpoints.huggingface.cloud')

## 2. Prepare ShareGPT data for realistic inference workload


In [16]:
from autobench.config import DataConfig
from autobench.data import BenchmarkDataset

data_config = DataConfig()
benchmark_dataset = BenchmarkDataset(data_config)
benchmark_dataset.build_data()

100%|█████████▉| 1999/2000 [00:00<00:00, 48205.68it/s]


In [17]:
data_config.file_path

'benchmark_data/data.json'

## 3. Run K6 Load Test


In [18]:
# Build the URL and headers for a direct HTTP call to the deployed endpoint's
# `/v1/chat/completions` route. Only `requests` and `huggingface_hub` are used here.

import requests
from huggingface_hub import get_token


# Chat-completions route appended to the URL of the endpoint bound to `deployment`.
API_URL = f"{deployment.endpoint.url}" + "/v1/chat/completions"
headers = {
    "Accept": "application/json",
    # Bearer token is whatever `huggingface_hub.get_token()` returns in this environment.
    "Authorization": f"Bearer {get_token()}",
    "Content-Type": "application/json",
}


def query(payload, timeout=60):
    """POST ``payload`` as JSON to ``API_URL`` and return the raw response.

    Args:
        payload: JSON-serialisable request body (e.g. a dict with ``messages``,
            ``max_tokens`` and ``model`` keys).
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout, ``requests`` waits indefinitely,
            which would hang the notebook kernel if the endpoint stalls.

    Returns:
        The ``requests.Response`` object; the status code is not checked here,
        so callers should inspect it (or call ``raise_for_status()``) themselves.
    """
    return requests.post(API_URL, headers=headers, json=payload, timeout=timeout)


# Send one chat request through `query` and keep the raw response so the next
# cell can print its content.
output = query(
    {
        "messages": [{"role": "user", "content": "How old is Statute of Liberty?"}],
        "max_tokens": 150,
        "model": "tgi",
    }
)

In [19]:
print(output.content)

b'{"object":"chat.completion","id":"","created":1724960951,"model":"/repository","system_fingerprint":"2.2.1-dev0-sha-d9fbbaa","choices":[{"index":0,"message":{"role":"assistant","content":"The Statue of Liberty, officially named \\"Liberty Enlightening the World,\\" was dedicated on October 28, 1886. It was designed by French sculptor Fr\xc3\xa9d\xc3\xa9ric Auguste Bartholdi and was a gift from the people of France to the people of the United States.\\n\\nThe statue was disassembled and shipped from France to New York Harbor, where it was reassembled on Bedloe\'s Island (now known as Liberty Island). The pedestal, designed by American architect Richard Morris Hunt, was completed in 1886, and the statue was officially dedicated on October 28, 1886.\\n\\nSo, as of 2023, the Statue of Liberty is 137 years old."},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":18,"completion_tokens":143,"total_tokens":161}}'


In [20]:
from autobench.runner import (
    K6ConstantArrivalRateExecutor,
    Scenario,
)

In [21]:
executor = K6ConstantArrivalRateExecutor(
    pre_allocated_vus=10, rate_per_second=1, duration="10s"
)

In [22]:
benchmark_dataset.file_path

'/Users/andrewreed/Documents/success_projects/auto-bench/benchmark_data/data.json'

In [23]:
scenario = Scenario(
    host=deployment.endpoint.url,
    executor=executor,
    data_file=benchmark_dataset.file_path,
    output_dir=os.path.abspath("../autobench/benchmark_results"),
)

In [24]:
scenario.run()

Preparing scenario 79f58a9a-a2b8-47d7-a2ca-a4d60fc19eb4
Running scenario 79f58a9a-a2b8-47d7-a2ca-a4d60fc19eb4

          /\      |‾‾| /‾‾/   /‾‾/   
     /\  /  \     |  |/  /   /  /    
    /  \/    \    |     (   /   ‾‾\  
   /          \   |  |\  \ |  (‾)  | 
  / __________ \  |__| \__\ \_____/ .io

     execution: local
        script: /var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/autobench_emoz85zk_k6_script.js
        output: json (/Users/andrewreed/Documents/success_projects/auto-bench/autobench/benchmark_results/scenario_79f58a9a-a2b8-47d7-a2ca-a4d60fc19eb4/results.json)

     scenarios: (100.00%) 1 scenario, 10 max VUs, 40s max duration (incl. graceful stop):
              * load_test: 1.00 iterations/s for 10s (maxVUs: 10, gracefulStop: 30s)


running (01.0s), 01/10 VUs, 0 complete and 0 interrupted iterations
load_test   [  10% ] 01/10 VUs  01.0s/10s  1.00 iters/s

running (02.0s), 02/10 VUs, 0 complete and 0 interrupted iterations
load_test   [  20% ] 02/10 VUs  02.0s/10

In [25]:
scenario.scenario_id

'79f58a9a-a2b8-47d7-a2ca-a4d60fc19eb4'

## Benchmark Runner

In [26]:
from autobench.runner import BenchmarkRunner

In [27]:
benchmark_runner = BenchmarkRunner(
    deployment=deployment, benchmark_dataset=benchmark_dataset
)

In [28]:
deployment.endpoint.url

'https://ehdndzfif3v55m3w.us-east-1.aws.endpoints.huggingface.cloud'

In [29]:
benchmark_runner.run_benchmark()

Running benchmark for deployment d30f3a82-95b9-44e8-b0ca-617d17cc
Running benchmark for arrival rate 1
Preparing scenario fa6befa6-cc3a-43eb-b298-b5f301ca9a6a
Running scenario fa6befa6-cc3a-43eb-b298-b5f301ca9a6a

          /\      |‾‾| /‾‾/   /‾‾/   
     /\  /  \     |  |/  /   /  /    
    /  \/    \    |     (   /   ‾‾\  
   /          \   |  |\  \ |  (‾)  | 
  / __________ \  |__| \__\ \_____/ .io

     execution: local
        script: /var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/autobench_uky___vy_k6_script.js
        output: json (/Users/andrewreed/Documents/success_projects/auto-bench/autobench/benchmark_results/deployment_d30f3a82-95b9-44e8-b0ca-617d17cc/scenario_fa6befa6-cc3a-43eb-b298-b5f301ca9a6a/results.json)

     scenarios: (100.00%) 1 scenario, 50 max VUs, 35s max duration (incl. graceful stop):
              * load_test: 1.00 iterations/s for 5s (maxVUs: 50, gracefulStop: 30s)


running (00.9s), 01/50 VUs, 0 complete and 0 interrupted iterations
load_test   [  17% 

## Scheduler

In [34]:
from huggingface_hub import HfApi

In [35]:
hf_api = HfApi()

In [36]:
hf_api._get_namespace()

'andrewrreed'

In [None]:
# NOTE(review): unexecuted scratch cell (no execution count). `hf_api.list` looks like
# an unfinished attribute lookup -- complete or remove it so Restart & Run All stays clean.
hf_api.list

In [56]:
from huggingface_hub.constants import INFERENCE_ENDPOINTS_ENDPOINT
from huggingface_hub.utils import get_session, hf_raise_for_status, build_hf_headers


def fetch_quotas(namespace: str):
    """Fetch the provider quotas for ``namespace`` and return the decoded JSON body.

    The request goes to ``{INFERENCE_ENDPOINTS_ENDPOINT}/provider/quotas/{namespace}``
    using the shared ``huggingface_hub`` session and the headers produced by
    ``build_hf_headers()``. ``hf_raise_for_status`` is applied to the response
    before the body is parsed.
    """
    quotas_url = f"{INFERENCE_ENDPOINTS_ENDPOINT}/provider/quotas/{namespace}"
    quota_response = get_session().get(quotas_url, headers=build_hf_headers())
    hf_raise_for_status(quota_response)
    return quota_response.json()

In [57]:
quota = fetch_quotas(namespace="andrewrreed")

In [58]:
quota

{'vendors': [{'name': 'aws',
   'quotas': [{'instanceType': 'nvidia-a10g',
     'architecture': 'Nvidia A10G',
     'maxAccelerators': 8,
     'usedAccelerators': 3},
    {'instanceType': 'nvidia-l4',
     'architecture': 'Nvidia L4',
     'maxAccelerators': 8,
     'usedAccelerators': 0},
    {'instanceType': 'nvidia-a100',
     'architecture': 'Nvidia A100',
     'maxAccelerators': 2,
     'usedAccelerators': 0},
    {'instanceType': 'intel-icl',
     'architecture': 'Intel Ice Lake',
     'maxAccelerators': 40,
     'usedAccelerators': 0},
    {'instanceType': 'nvidia-t4',
     'architecture': 'Nvidia T4',
     'maxAccelerators': 15,
     'usedAccelerators': 0},
    {'instanceType': 'intel-spr',
     'architecture': 'Intel Sapphire Rapids',
     'maxAccelerators': 40,
     'usedAccelerators': 0},
    {'instanceType': 'inf2',
     'architecture': 'AWS Inferentia 2',
     'maxAccelerators': 24,
     'usedAccelerators': 0}]},
  {'name': 'azure',
   'quotas': [{'instanceType': 'intel-xe

Scheduler design:

Needs to:

1. Take in:
   - model_id
   - GPU types
   - Optionally, provider (AWS/GCP)
   - Optionally, specify (MAX_INPUT_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_PREFILL_TOKENS, MAX_BATCH_TOTAL_TOKENS)
2. Determine which instances can fit the desired model
3. Iterate through instances to:
   a. deploy
   b. benchmark
   c. report results

Runner design:

1. Take in model_id, instance, TGI config
2. Deploy this
3. Shut down / pause instance
