In [1]:
!nvidia-smi

Thu Apr 25 16:02:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:0A:00.0 Off |                    0 |
| N/A   31C    P0              72W / 700W |      0MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
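# Colab setup: import numpy up front (numpy fix for Colab), then install huggingface_hub, authenticate, and download the pre-quantized Mixtral weights.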
import numpy
from IPython.display import clear_output

!pip install huggingface_hub
!huggingface-cli login
!huggingface-cli download lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo --quiet --cache-dir $TMP_DIR --local-dir Mixtral-8x7B-Instruct-v0.1-offloading-demo



In [3]:

import os
import sys

# Put the parent of the current working directory at the front of the import
# path (only once), so modules living one level above the notebook resolve.
script_dir = os.getcwd()
module_path = os.path.abspath(os.path.join(script_dir, '../'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

# Also search the "mixtral-offloading" directory (relative to the working
# directory) -- presumably where the `src` package imported below lives.
sys.path.append("mixtral-offloading")
import torch
from torch.nn import functional as F
from hqq.core.quantize import BaseQuantizeConfig
from huggingface_hub import snapshot_download
from IPython.display import clear_output
from tqdm.auto import trange
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import logging as hf_logging
import time
import gc
from src.build_model import OffloadConfig, QuantConfig, build_model

[36mhqq_aten package not installed. HQQBackend.ATEN backend will not work unless you install the hqq_aten lib in hqq/kernels.[0m


In [4]:
# Reload imported modules (e.g. get_decode_model_characterstics) every time a cell is executed, so the notebook does not need to be restarted after the source code is edited.
%load_ext autoreload
%autoreload 2  

In [5]:
%reload_ext autoreload

In [6]:
import random

# Pool of benchmark prompts that get_prompts() draws from.
prompts = [
    "What are the main causes of climate change?",
    "How do you bake a chocolate cake from scratch?",
    "What is the plot of Shakespeare's 'Macbeth'?",
    "Can you explain the theory of relativity in simple terms?",
    "What are the health benefits of daily exercise?",
    "How does a blockchain work?",
    "What are the top tourist attractions in Paris?",
    "What is the process of photosynthesis in plants?",
    "How can one improve their time management skills?",
    "What are the rules of chess?"
]


def get_prompts(b: int = 2, seed=None):
    """Return `b` distinct prompts drawn at random from `prompts`.

    Unlike an in-place shuffle, this never reorders the module-level
    `prompts` list, so re-running cells does not change the pool.

    Args:
        b: Number of prompts to return. Values larger than the pool size are
            clamped to the pool size; values <= 0 yield an empty list.
        seed: Optional integer seed. When given, the draw is reproducible and
            independent of the global `random` state; when None (default) the
            global `random` generator is used.

    Returns:
        A new list of at most `b` prompts in random order.
    """
    if b <= 0:
        return []
    rng = random if seed is None else random.Random(seed)
    return rng.sample(prompts, min(b, len(prompts)))

In [11]:
from transformers import TextStreamer

def run_for_config(offload_per_layer: int, attn_bits: int, ffn_bits: int, routing_strategy: str, alpha: float, n: int = 5, b_size:int = 1):
    """Build the offloaded Mixtral model for one configuration and time `n` generations.

    A batch of `b_size` prompts is sampled once via `get_prompts`, then the same
    batch is generated `n` times (greedy decoding, exactly 100 new tokens each).

    Args:
        offload_per_layer: Number of experts per layer to offload; must be in
            [0, config.num_local_experts].
        attn_bits: `nbits` for the attention quantization config.
        ffn_bits: `nbits` for the FFN/expert quantization config.
        routing_strategy: Forwarded to `build_model` as `routing_strategy`.
        alpha: Forwarded to `build_model` as `routing_threshold`.
        n: Number of timed `generate` calls.
        b_size: Number of prompts in the batch.

    Returns:
        A list with one dict per timed call, with keys "latency" (seconds),
        "throughput" (new tokens per sequence per second) and
        "expert_loads_saved".

    Raises:
        ValueError: If `offload_per_layer` is outside the valid range.
    """
    print (f"Running with configs: {offload_per_layer}, {attn_bits}, {ffn_bits}, {routing_strategy}, {alpha}, {n}, {b_size}")
    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    quantized_model_name = "lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo"
    state_path = "Mixtral-8x7B-Instruct-v0.1-offloading-demo"

    config = AutoConfig.from_pretrained(quantized_model_name)
    device = torch.device("cuda:0")

    num_experts = config.num_local_experts
    if not 0 <= offload_per_layer <= num_experts:
        raise ValueError(
            f"offload_per_layer must be between 0 and {num_experts}, got {offload_per_layer}"
        )

    offload_config = OffloadConfig(
        main_size=config.num_hidden_layers * (num_experts - offload_per_layer),
        offload_size=config.num_hidden_layers * offload_per_layer,
        buffer_size=4,
        offload_per_layer=offload_per_layer,
    )

    attn_config = BaseQuantizeConfig(
        nbits=attn_bits,
        group_size=64,
        quant_zero=True,
        quant_scale=True,
    )
    attn_config["scale_quant_params"]["group_size"] = 256

    ffn_config = BaseQuantizeConfig(
        nbits=ffn_bits,
        group_size=16,
        quant_zero=True,
        quant_scale=True,
    )
    quant_config = QuantConfig(ffn_config=ffn_config, attn_config=attn_config)

    model = None
    try:
        model = build_model(
            device=device,
            quant_config=quant_config,
            offload_config=offload_config,
            state_path=state_path,
            routing_strategy=routing_strategy,
            routing_threshold=alpha
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Pad with EOS instead of adding a brand-new '[PAD]' token: a new token
        # would get an id the model's embedding table was never resized for,
        # and generate() below is already told that the pad id is the EOS id.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Left padding keeps the real tokens adjacent to the generated ones
        # when a batch mixes prompts of different lengths.
        tokenizer.padding_side = "left"

        batch_prompts = get_prompts(b_size)
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True)
        input_ids = inputs['input_ids'].to(model.device)
        # Use the tokenizer's mask so padded positions are ignored; for a
        # single unpadded prompt this is all ones, as before.
        attention_mask = inputs['attention_mask'].to(model.device)
        print (f"Batch: {batch_prompts} | Input_ids: {input_ids.shape}")

        results = []
        for _ in range(n):
            # Drain pending GPU work so the timer brackets only this call.
            torch.cuda.synchronize(device)
            start_time = time.perf_counter()
            result = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=False,
                min_new_tokens=100,
                max_new_tokens=100,
                pad_token_id=tokenizer.eos_token_id,
                return_dict_in_generate=True,
                output_router_logits=True,
            )
            torch.cuda.synchronize(device)
            latency = time.perf_counter() - start_time

            # NOTE(review): 32 presumably matches config.num_hidden_layers (one
            # router entry per layer for the last step) -- confirm against the
            # custom model's `router_logits` layout.
            # NOTE(review): in recorded runs this figure keeps growing across
            # iterations of the same config, which suggests the model-side
            # counter is cumulative rather than per-call -- confirm before
            # averaging it.
            total_experts_saved = sum(entry[1] for entry in result['router_logits'][-32:])

            num_new_tokens = result['sequences'].shape[1] - input_ids.shape[1]
            print(f"Total Latency :{latency} sec, Throughput:{num_new_tokens/latency}, Expert Loads saved:{total_experts_saved} ")

            results.append({"latency": latency, "throughput": num_new_tokens/latency, "expert_loads_saved": total_experts_saved})

        return results
    finally:
        # Release this configuration's weights before the next one is built;
        # the previous `'model' in locals()` check could never fire inside the
        # function, so nothing was being freed explicitly.
        del model
        gc.collect()
        torch.cuda.empty_cache()

In [12]:
# Benchmark grid: every (routing strategy, alpha) pair below is run with
# 3, 4, 5 and 6 offloaded experts per layer, always with 4-bit attention,
# 2-bit FFN and 50 timed generations.
run_configs = [
    {
        "offload_per_layer": offload,
        "attn_bits": 4,
        "ffn_bits": 2,
        "routing_strategy": strategy,
        "alpha": alpha,
        "n": 50,
    }
    for strategy, alpha in (
        ('TOP-K', 0.05),
        ('THRESHOLDING', 0.05),
        ('THRESHOLDING', 0.15),
        ('THRESHOLDING', 0.25),
        ('BIASING', 0.05),
    )
    for offload in range(3, 7)
]

print(len(run_configs))

20


In [13]:
profiling_results = []
from tqdm import tqdm
import pickle

# Benchmark every configuration in order. tqdm wraps the list itself (not the
# enumerate() iterator, which has no length), so the bar can show the total
# number of configurations and an ETA.
for i, rc in enumerate(tqdm(run_configs)):
    # Example of a single manual run:
    # run_for_config(offload_per_layer=4, attn_bits=4, ffn_bits=2, routing_strategy='BIASING', alpha=0.05, n=3)
    result = run_for_config(**rc)
    print (f"Config: {i} Result: {result}")
    profiling_results.append(result)

    # Rewrite the checkpoint after every configuration so an interrupted
    # sweep keeps everything measured so far.
    with open("profiling_results.pkl", "wb") as fp:
        pickle.dump(profiling_results, fp)



Running with configs: 3, 4, 2, TOP-K, 0.05, 50, 1


Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ["What is the plot of Shakespeare's 'Macbeth'?"] | Input_ids: torch.Size([1, 15])
Total Latency :18.18439745903015 sec, Throughput:5.499219879311492, Expert Loads saved:0 
Total Latency :18.162853956222534 sec, Throughput:5.505742668031547, Expert Loads saved:0 
Total Latency :18.160635232925415 sec, Throughput:5.506415316282494, Expert Loads saved:0 
Total Latency :18.17613410949707 sec, Throughput:5.501719969580868, Expert Loads saved:0 
Total Latency :18.319108486175537 sec, Throughput:5.4587809267828025, Expert Loads saved:0 
Total Latency :18.17943525314331 sec, Throughput:5.500720930410065, Expert Loads saved:0 
Total Latency :18.221656322479248 sec, Throughput:5.487975309721676, Expert Loads saved:0 
Total Latency :18.224007606506348 sec, Throughput:5.487267244351782, Expert Loads saved:0 
Total Latency :18.28162169456482 sec, Throughput:5.469974254512131, Expert Loads saved:0 
Total Latency :18.214136838912964 sec, Throughput:5.4902409531896375, Expert Loads saved:0 
Tot

1it [15:33, 933.96s/it]

Total Latency :18.159271717071533 sec, Throughput:5.506828773644594, Expert Loads saved:0 
Config: 0 Result: [{'latency': 18.18439745903015, 'throughput': 5.499219879311492, 'expert_loads_saved': 0}, {'latency': 18.162853956222534, 'throughput': 5.505742668031547, 'expert_loads_saved': 0}, {'latency': 18.160635232925415, 'throughput': 5.506415316282494, 'expert_loads_saved': 0}, {'latency': 18.17613410949707, 'throughput': 5.501719969580868, 'expert_loads_saved': 0}, {'latency': 18.319108486175537, 'throughput': 5.4587809267828025, 'expert_loads_saved': 0}, {'latency': 18.17943525314331, 'throughput': 5.500720930410065, 'expert_loads_saved': 0}, {'latency': 18.221656322479248, 'throughput': 5.487975309721676, 'expert_loads_saved': 0}, {'latency': 18.224007606506348, 'throughput': 5.487267244351782, 'expert_loads_saved': 0}, {'latency': 18.28162169456482, 'throughput': 5.469974254512131, 'expert_loads_saved': 0}, {'latency': 18.214136838912964, 'throughput': 5.4902409531896375, 'expert_

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the health benefits of daily exercise?'] | Input_ids: torch.Size([1, 10])
Total Latency :19.534354209899902 sec, Throughput:5.119186379313248, Expert Loads saved:0 
Total Latency :19.483198881149292 sec, Throughput:5.132627378595086, Expert Loads saved:0 
Total Latency :19.5037260055542 sec, Throughput:5.127225432285214, Expert Loads saved:0 
Total Latency :19.46017551422119 sec, Throughput:5.1386997988235805, Expert Loads saved:0 
Total Latency :19.734111070632935 sec, Throughput:5.067367850625597, Expert Loads saved:0 
Total Latency :19.559525966644287 sec, Throughput:5.112598340600604, Expert Loads saved:0 
Total Latency :19.573420763015747 sec, Throughput:5.108969004996378, Expert Loads saved:0 
Total Latency :19.60790228843689 sec, Throughput:5.099984614823978, Expert Loads saved:0 
Total Latency :19.64103078842163 sec, Throughput:5.09138247769307, Expert Loads saved:0 
Total Latency :19.58775281906128 sec, Throughput:5.105230851325006, Expert Loads saved:0 
Tota

2it [32:15, 973.53s/it]

Total Latency :19.50558638572693 sec, Throughput:5.126736413993392, Expert Loads saved:0 
Config: 1 Result: [{'latency': 19.534354209899902, 'throughput': 5.119186379313248, 'expert_loads_saved': 0}, {'latency': 19.483198881149292, 'throughput': 5.132627378595086, 'expert_loads_saved': 0}, {'latency': 19.5037260055542, 'throughput': 5.127225432285214, 'expert_loads_saved': 0}, {'latency': 19.46017551422119, 'throughput': 5.1386997988235805, 'expert_loads_saved': 0}, {'latency': 19.734111070632935, 'throughput': 5.067367850625597, 'expert_loads_saved': 0}, {'latency': 19.559525966644287, 'throughput': 5.112598340600604, 'expert_loads_saved': 0}, {'latency': 19.573420763015747, 'throughput': 5.108969004996378, 'expert_loads_saved': 0}, {'latency': 19.60790228843689, 'throughput': 5.099984614823978, 'expert_loads_saved': 0}, {'latency': 19.64103078842163, 'throughput': 5.09138247769307, 'expert_loads_saved': 0}, {'latency': 19.58775281906128, 'throughput': 5.105230851325006, 'expert_loads

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the main causes of climate change?'] | Input_ids: torch.Size([1, 10])
Total Latency :21.466485023498535 sec, Throughput:4.658424511070809, Expert Loads saved:0 
Total Latency :21.47875142097473 sec, Throughput:4.655764110307948, Expert Loads saved:0 
Total Latency :21.50893998146057 sec, Throughput:4.6492295801742936, Expert Loads saved:0 
Total Latency :21.475234031677246 sec, Throughput:4.656526669394804, Expert Loads saved:0 
Total Latency :21.511430978775024 sec, Throughput:4.64869120509316, Expert Loads saved:0 
Total Latency :21.477372407913208 sec, Throughput:4.656063046294975, Expert Loads saved:0 
Total Latency :21.50679349899292 sec, Throughput:4.649693595871584, Expert Loads saved:0 
Total Latency :21.48401403427124 sec, Throughput:4.6546236583387195, Expert Loads saved:0 
Total Latency :21.50493359565735 sec, Throughput:4.650095735249968, Expert Loads saved:0 
Total Latency :21.64649510383606 sec, Throughput:4.619685520464632, Expert Loads saved:0 
Total L

3it [50:28, 1028.25s/it]

Total Latency :21.405557870864868 sec, Throughput:4.671683896457103, Expert Loads saved:0 
Config: 2 Result: [{'latency': 21.466485023498535, 'throughput': 4.658424511070809, 'expert_loads_saved': 0}, {'latency': 21.47875142097473, 'throughput': 4.655764110307948, 'expert_loads_saved': 0}, {'latency': 21.50893998146057, 'throughput': 4.6492295801742936, 'expert_loads_saved': 0}, {'latency': 21.475234031677246, 'throughput': 4.656526669394804, 'expert_loads_saved': 0}, {'latency': 21.511430978775024, 'throughput': 4.64869120509316, 'expert_loads_saved': 0}, {'latency': 21.477372407913208, 'throughput': 4.656063046294975, 'expert_loads_saved': 0}, {'latency': 21.50679349899292, 'throughput': 4.649693595871584, 'expert_loads_saved': 0}, {'latency': 21.48401403427124, 'throughput': 4.6546236583387195, 'expert_loads_saved': 0}, {'latency': 21.50493359565735, 'throughput': 4.650095735249968, 'expert_loads_saved': 0}, {'latency': 21.64649510383606, 'throughput': 4.619685520464632, 'expert_loa

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How does a blockchain work?'] | Input_ids: torch.Size([1, 8])
Total Latency :23.385247945785522 sec, Throughput:4.276200116921229, Expert Loads saved:0 
Total Latency :23.392844915390015 sec, Throughput:4.274811394753042, Expert Loads saved:0 
Total Latency :23.398696184158325 sec, Throughput:4.273742400557483, Expert Loads saved:0 
Total Latency :23.420541286468506 sec, Throughput:4.269756141706946, Expert Loads saved:0 
Total Latency :23.39938521385193 sec, Throughput:4.273616553857242, Expert Loads saved:0 
Total Latency :23.42197060585022 sec, Throughput:4.269495581000452, Expert Loads saved:0 
Total Latency :23.39749813079834 sec, Throughput:4.273961234700093, Expert Loads saved:0 
Total Latency :23.423492193222046 sec, Throughput:4.269218235056195, Expert Loads saved:0 
Total Latency :23.407596111297607 sec, Throughput:4.272117458132973, Expert Loads saved:0 
Total Latency :23.414855480194092 sec, Throughput:4.270792962381806, Expert Loads saved:0 
Total Latency :23.4083

4it [1:10:23, 1093.91s/it]

Total Latency :23.609241008758545 sec, Throughput:4.235629597872378, Expert Loads saved:0 
Config: 3 Result: [{'latency': 23.385247945785522, 'throughput': 4.276200116921229, 'expert_loads_saved': 0}, {'latency': 23.392844915390015, 'throughput': 4.274811394753042, 'expert_loads_saved': 0}, {'latency': 23.398696184158325, 'throughput': 4.273742400557483, 'expert_loads_saved': 0}, {'latency': 23.420541286468506, 'throughput': 4.269756141706946, 'expert_loads_saved': 0}, {'latency': 23.39938521385193, 'throughput': 4.273616553857242, 'expert_loads_saved': 0}, {'latency': 23.42197060585022, 'throughput': 4.269495581000452, 'expert_loads_saved': 0}, {'latency': 23.39749813079834, 'throughput': 4.273961234700093, 'expert_loads_saved': 0}, {'latency': 23.423492193222046, 'throughput': 4.269218235056195, 'expert_loads_saved': 0}, {'latency': 23.407596111297607, 'throughput': 4.272117458132973, 'expert_loads_saved': 0}, {'latency': 23.414855480194092, 'throughput': 4.270792962381806, 'expert_l

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :17.324288845062256 sec, Throughput:5.772242710470731, Expert Loads saved:409 
Total Latency :17.399637460708618 sec, Throughput:5.747246184055113, Expert Loads saved:815 
Total Latency :17.484240770339966 sec, Throughput:5.719436223369715, Expert Loads saved:1222 
Total Latency :17.472959518432617 sec, Throughput:5.72312892355229, Expert Loads saved:1623 
Total Latency :17.508854150772095 sec, Throughput:5.711396025055715, Expert Loads saved:2024 
Total Latency :17.56312131881714 sec, Throughput:5.69374874686198, Expert Loads saved:2425 
Total Latency :17.688491582870483 sec, Throughput:5.653393311210318, Expert Loads saved:2826 
Total Latency :17.384219646453857 sec, Throughput:5.752343333995934, Expert Loads saved:3227 
Total Latency :17.350229263305664 sec, Throughput:5.7636126002952555, Expert Loads saved:3628 
Total Latency :17.35158109664917 sec, Throughput:5.7631635666510

5it [1:25:16, 1021.72s/it]

Total Latency :17.451910495758057 sec, Throughput:5.730031679013393, Expert Loads saved:20069 
Config: 4 Result: [{'latency': 17.324288845062256, 'throughput': 5.772242710470731, 'expert_loads_saved': 409}, {'latency': 17.399637460708618, 'throughput': 5.747246184055113, 'expert_loads_saved': 815}, {'latency': 17.484240770339966, 'throughput': 5.719436223369715, 'expert_loads_saved': 1222}, {'latency': 17.472959518432617, 'throughput': 5.72312892355229, 'expert_loads_saved': 1623}, {'latency': 17.508854150772095, 'throughput': 5.711396025055715, 'expert_loads_saved': 2024}, {'latency': 17.56312131881714, 'throughput': 5.69374874686198, 'expert_loads_saved': 2425}, {'latency': 17.688491582870483, 'throughput': 5.653393311210318, 'expert_loads_saved': 2826}, {'latency': 17.384219646453857, 'throughput': 5.752343333995934, 'expert_loads_saved': 3227}, {'latency': 17.350229263305664, 'throughput': 5.7636126002952555, 'expert_loads_saved': 3628}, {'latency': 17.35158109664917, 'throughput':

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :19.21473526954651 sec, Throughput:5.204339200992807, Expert Loads saved:478 
Total Latency :19.02950429916382 sec, Throughput:5.254997630410905, Expert Loads saved:980 
Total Latency :18.923544883728027 sec, Throughput:5.284422163734659, Expert Loads saved:1489 
Total Latency :19.078222513198853 sec, Throughput:5.241578450550998, Expert Loads saved:1998 
Total Latency :18.92999243736267 sec, Throughput:5.282622290045247, Expert Loads saved:2507 
Total Latency :18.937790632247925 sec, Throughput:5.280447014221212, Expert Loads saved:3016 
Total Latency :18.9459285736084 sec, Throughput:5.278178876874876, Expert Loads saved:3525 
Total Latency :18.967052459716797 sec, Throughput:5.272300491200999, Expert Loads saved:4034 
Total Latency :18.96513295173645 sec, Throughput:5.272834113764754, Expert Loads saved:4543 
Total Latency :18.92801308631897 sec, Throughput:5.2831747074540685,

6it [1:41:36, 1007.39s/it]

Total Latency :19.667511463165283 sec, Throughput:5.084527353004834, Expert Loads saved:25412 
Config: 5 Result: [{'latency': 19.21473526954651, 'throughput': 5.204339200992807, 'expert_loads_saved': 478}, {'latency': 19.02950429916382, 'throughput': 5.254997630410905, 'expert_loads_saved': 980}, {'latency': 18.923544883728027, 'throughput': 5.284422163734659, 'expert_loads_saved': 1489}, {'latency': 19.078222513198853, 'throughput': 5.241578450550998, 'expert_loads_saved': 1998}, {'latency': 18.92999243736267, 'throughput': 5.282622290045247, 'expert_loads_saved': 2507}, {'latency': 18.937790632247925, 'throughput': 5.280447014221212, 'expert_loads_saved': 3016}, {'latency': 18.9459285736084, 'throughput': 5.278178876874876, 'expert_loads_saved': 3525}, {'latency': 18.967052459716797, 'throughput': 5.272300491200999, 'expert_loads_saved': 4034}, {'latency': 18.96513295173645, 'throughput': 5.272834113764754, 'expert_loads_saved': 4543}, {'latency': 18.92801308631897, 'throughput': 5.2

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the rules of chess?'] | Input_ids: torch.Size([1, 9])
Total Latency :21.529940605163574 sec, Throughput:4.6446946526186315, Expert Loads saved:571 
Total Latency :21.462987899780273 sec, Throughput:4.6591835427081305, Expert Loads saved:1134 
Total Latency :21.48137402534485 sec, Throughput:4.655195700331588, Expert Loads saved:1697 
Total Latency :21.544202089309692 sec, Throughput:4.6416200324086425, Expert Loads saved:2260 
Total Latency :21.462239742279053 sec, Throughput:4.6593459583347805, Expert Loads saved:2823 
Total Latency :21.573243379592896 sec, Throughput:4.63537161475657, Expert Loads saved:3386 
Total Latency :21.555057764053345 sec, Throughput:4.639282394629751, Expert Loads saved:3949 
Total Latency :21.45755100250244 sec, Throughput:4.660364082943935, Expert Loads saved:4512 
Total Latency :21.570231914520264 sec, Throughput:4.636018768656993, Expert Loads saved:5075 
Total Latency :21.6745285987854 sec, Throughput:4.613710491752232, Expert Loads sa

7it [1:59:35, 1030.74s/it]

Total Latency :21.060004711151123 sec, Throughput:4.748337019461857, Expert Loads saved:28158 
Config: 6 Result: [{'latency': 21.529940605163574, 'throughput': 4.6446946526186315, 'expert_loads_saved': 571}, {'latency': 21.462987899780273, 'throughput': 4.6591835427081305, 'expert_loads_saved': 1134}, {'latency': 21.48137402534485, 'throughput': 4.655195700331588, 'expert_loads_saved': 1697}, {'latency': 21.544202089309692, 'throughput': 4.6416200324086425, 'expert_loads_saved': 2260}, {'latency': 21.462239742279053, 'throughput': 4.6593459583347805, 'expert_loads_saved': 2823}, {'latency': 21.573243379592896, 'throughput': 4.63537161475657, 'expert_loads_saved': 3386}, {'latency': 21.555057764053345, 'throughput': 4.639282394629751, 'expert_loads_saved': 3949}, {'latency': 21.45755100250244, 'throughput': 4.660364082943935, 'expert_loads_saved': 4512}, {'latency': 21.570231914520264, 'throughput': 4.636018768656993, 'expert_loads_saved': 5075}, {'latency': 21.6745285987854, 'throughpu

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :23.70490336418152 sec, Throughput:4.218536497014435, Expert Loads saved:542 
Total Latency :23.280213832855225 sec, Throughput:4.295493190825876, Expert Loads saved:1105 
Total Latency :23.27651619911194 sec, Throughput:4.296175559288175, Expert Loads saved:1670 
Total Latency :23.27291512489319 sec, Throughput:4.296840316881401, Expert Loads saved:2235 
Total Latency :23.33835196495056 sec, Throughput:4.284792694453301, Expert Loads saved:2800 
Total Latency :23.215132474899292 sec, Throughput:4.307535186720221, Expert Loads saved:3365 
Total Latency :23.30180287361145 sec, Throughput:4.291513431059312, Expert Loads saved:3930 
Total Latency :23.378507137298584 sec, Throughput:4.277433088978458, Expert Loads saved:4495 
Total Latency :23.263429164886475 sec, Throughput:4.298592408334139, Expert Loads saved:5060 
Total Latency :23.313727617263794 sec, Throughput:4.28931836391320

8it [2:19:23, 1080.98s/it]

Total Latency :23.670578241348267 sec, Throughput:4.224653871163903, Expert Loads saved:28225 
Config: 7 Result: [{'latency': 23.70490336418152, 'throughput': 4.218536497014435, 'expert_loads_saved': 542}, {'latency': 23.280213832855225, 'throughput': 4.295493190825876, 'expert_loads_saved': 1105}, {'latency': 23.27651619911194, 'throughput': 4.296175559288175, 'expert_loads_saved': 1670}, {'latency': 23.27291512489319, 'throughput': 4.296840316881401, 'expert_loads_saved': 2235}, {'latency': 23.33835196495056, 'throughput': 4.284792694453301, 'expert_loads_saved': 2800}, {'latency': 23.215132474899292, 'throughput': 4.307535186720221, 'expert_loads_saved': 3365}, {'latency': 23.30180287361145, 'throughput': 4.291513431059312, 'expert_loads_saved': 3930}, {'latency': 23.378507137298584, 'throughput': 4.277433088978458, 'expert_loads_saved': 4495}, {'latency': 23.263429164886475, 'throughput': 4.298592408334139, 'expert_loads_saved': 5060}, {'latency': 23.313727617263794, 'throughput': 

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the health benefits of daily exercise?'] | Input_ids: torch.Size([1, 10])
Total Latency :16.306580305099487 sec, Throughput:6.132493639314886, Expert Loads saved:791 
Total Latency :16.371798753738403 sec, Throughput:6.10806433087663, Expert Loads saved:1664 
Total Latency :16.290611505508423 sec, Throughput:6.138504988973957, Expert Loads saved:2464 
Total Latency :16.368008136749268 sec, Throughput:6.10947887883078, Expert Loads saved:3319 
Total Latency :16.245721101760864 sec, Throughput:6.155466991807526, Expert Loads saved:4151 
Total Latency :16.206371068954468 sec, Throughput:6.17041283175132, Expert Loads saved:4956 
Total Latency :16.13752245903015 sec, Throughput:6.1967380837969035, Expert Loads saved:5794 
Total Latency :16.466004610061646 sec, Throughput:6.073118668926792, Expert Loads saved:6613 
Total Latency :16.3336443901062 sec, Throughput:6.122332384104868, Expert Loads saved:7483 
Total Latency :16.222235202789307 sec, Throughput:6.164378629081007,

9it [2:33:21, 1004.86s/it]

Total Latency :16.27250075340271 sec, Throughput:6.145336940856445, Expert Loads saved:42454 
Config: 8 Result: [{'latency': 16.306580305099487, 'throughput': 6.132493639314886, 'expert_loads_saved': 791}, {'latency': 16.371798753738403, 'throughput': 6.10806433087663, 'expert_loads_saved': 1664}, {'latency': 16.290611505508423, 'throughput': 6.138504988973957, 'expert_loads_saved': 2464}, {'latency': 16.368008136749268, 'throughput': 6.10947887883078, 'expert_loads_saved': 3319}, {'latency': 16.245721101760864, 'throughput': 6.155466991807526, 'expert_loads_saved': 4151}, {'latency': 16.206371068954468, 'throughput': 6.17041283175132, 'expert_loads_saved': 4956}, {'latency': 16.13752245903015, 'throughput': 6.1967380837969035, 'expert_loads_saved': 5794}, {'latency': 16.466004610061646, 'throughput': 6.073118668926792, 'expert_loads_saved': 6613}, {'latency': 16.3336443901062, 'throughput': 6.122332384104868, 'expert_loads_saved': 7483}, {'latency': 16.222235202789307, 'throughput': 6

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the top tourist attractions in Paris?'] | Input_ids: torch.Size([1, 11])
Total Latency :16.98691177368164 sec, Throughput:5.886885228598947, Expert Loads saved:1024 
Total Latency :16.70069909095764 sec, Throughput:5.9877732935229995, Expert Loads saved:2082 
Total Latency :16.77380061149597 sec, Throughput:5.961678114348439, Expert Loads saved:3098 
Total Latency :16.69525456428528 sec, Throughput:5.989725979615872, Expert Loads saved:4142 
Total Latency :16.596738576889038 sec, Throughput:6.025280180001752, Expert Loads saved:5186 
Total Latency :16.832696437835693 sec, Throughput:5.940818832520796, Expert Loads saved:6235 
Total Latency :16.69973850250244 sec, Throughput:5.988117717233422, Expert Loads saved:7284 
Total Latency :16.736199855804443 sec, Throughput:5.975072051097551, Expert Loads saved:8333 
Total Latency :16.720372915267944 sec, Throughput:5.98072785258794, Expert Loads saved:9382 
Total Latency :16.69246554374695 sec, Throughput:5.990726758603993, 

10it [2:47:46, 961.63s/it]

Total Latency :16.88895845413208 sec, Throughput:5.9210282428952175, Expert Loads saved:52391 
Config: 9 Result: [{'latency': 16.98691177368164, 'throughput': 5.886885228598947, 'expert_loads_saved': 1024}, {'latency': 16.70069909095764, 'throughput': 5.9877732935229995, 'expert_loads_saved': 2082}, {'latency': 16.77380061149597, 'throughput': 5.961678114348439, 'expert_loads_saved': 3098}, {'latency': 16.69525456428528, 'throughput': 5.989725979615872, 'expert_loads_saved': 4142}, {'latency': 16.596738576889038, 'throughput': 6.025280180001752, 'expert_loads_saved': 5186}, {'latency': 16.832696437835693, 'throughput': 5.940818832520796, 'expert_loads_saved': 6235}, {'latency': 16.69973850250244, 'throughput': 5.988117717233422, 'expert_loads_saved': 7284}, {'latency': 16.736199855804443, 'throughput': 5.975072051097551, 'expert_loads_saved': 8333}, {'latency': 16.720372915267944, 'throughput': 5.98072785258794, 'expert_loads_saved': 9382}, {'latency': 16.69246554374695, 'throughput': 

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :19.24893546104431 sec, Throughput:5.1950924871860265, Expert Loads saved:1407 
Total Latency :19.285730361938477 sec, Throughput:5.185180862911777, Expert Loads saved:2795 
Total Latency :19.11365008354187 sec, Throughput:5.231863069739185, Expert Loads saved:4193 
Total Latency :19.3957736492157 sec, Throughput:5.155762374245055, Expert Loads saved:5587 
Total Latency :18.97361660003662 sec, Throughput:5.27047647836454, Expert Loads saved:7009 
Total Latency :19.146771669387817 sec, Throughput:5.222812583067552, Expert Loads saved:8397 
Total Latency :19.31075620651245 sec, Throughput:5.178461108958309, Expert Loads saved:9793 
Total Latency :18.82799530029297 sec, Throughput:5.311239906589735, Expert Loads saved:11203 
Total Latency :19.246833562850952 sec, Throughput:5.195659830145454, Expert Loads saved:12619 
Total Latency :19.323692560195923 sec, Throughput:5.1749943593071

11it [3:04:02, 966.01s/it]

Total Latency :19.280458688735962 sec, Throughput:5.186598597803176, Expert Loads saved:70107 
Config: 10 Result: [{'latency': 19.24893546104431, 'throughput': 5.1950924871860265, 'expert_loads_saved': 1407}, {'latency': 19.285730361938477, 'throughput': 5.185180862911777, 'expert_loads_saved': 2795}, {'latency': 19.11365008354187, 'throughput': 5.231863069739185, 'expert_loads_saved': 4193}, {'latency': 19.3957736492157, 'throughput': 5.155762374245055, 'expert_loads_saved': 5587}, {'latency': 18.97361660003662, 'throughput': 5.27047647836454, 'expert_loads_saved': 7009}, {'latency': 19.146771669387817, 'throughput': 5.222812583067552, 'expert_loads_saved': 8397}, {'latency': 19.31075620651245, 'throughput': 5.178461108958309, 'expert_loads_saved': 9793}, {'latency': 18.82799530029297, 'throughput': 5.311239906589735, 'expert_loads_saved': 11203}, {'latency': 19.246833562850952, 'throughput': 5.195659830145454, 'expert_loads_saved': 12619}, {'latency': 19.323692560195923, 'throughput'

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the main causes of climate change?'] | Input_ids: torch.Size([1, 10])
Total Latency :20.369444370269775 sec, Throughput:4.909314077607095, Expert Loads saved:1558 
Total Latency :20.336856365203857 sec, Throughput:4.917180817144331, Expert Loads saved:3109 
Total Latency :20.459956169128418 sec, Throughput:4.8875960033036545, Expert Loads saved:4639 
Total Latency :20.391586780548096 sec, Throughput:4.903983249375758, Expert Loads saved:6191 
Total Latency :20.333828449249268 sec, Throughput:4.917913035884398, Expert Loads saved:7721 
Total Latency :20.388686180114746 sec, Throughput:4.904680915513371, Expert Loads saved:9273 
Total Latency :20.513322830200195 sec, Throughput:4.87488062405851, Expert Loads saved:10803 
Total Latency :20.360116004943848 sec, Throughput:4.911563371039634, Expert Loads saved:12355 
Total Latency :20.445189476013184 sec, Throughput:4.8911261065749745, Expert Loads saved:13885 
Total Latency :20.358350038528442 sec, Throughput:4.9119894201

12it [3:21:23, 988.87s/it]

Total Latency :20.4180588722229 sec, Throughput:4.897625216275668, Expert Loads saved:77077 
Config: 11 Result: [{'latency': 20.369444370269775, 'throughput': 4.909314077607095, 'expert_loads_saved': 1558}, {'latency': 20.336856365203857, 'throughput': 4.917180817144331, 'expert_loads_saved': 3109}, {'latency': 20.459956169128418, 'throughput': 4.8875960033036545, 'expert_loads_saved': 4639}, {'latency': 20.391586780548096, 'throughput': 4.903983249375758, 'expert_loads_saved': 6191}, {'latency': 20.333828449249268, 'throughput': 4.917913035884398, 'expert_loads_saved': 7721}, {'latency': 20.388686180114746, 'throughput': 4.904680915513371, 'expert_loads_saved': 9273}, {'latency': 20.513322830200195, 'throughput': 4.87488062405851, 'expert_loads_saved': 10803}, {'latency': 20.360116004943848, 'throughput': 4.911563371039634, 'expert_loads_saved': 12355}, {'latency': 20.445189476013184, 'throughput': 4.8911261065749745, 'expert_loads_saved': 13885}, {'latency': 20.358350038528442, 'thro

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the main causes of climate change?'] | Input_ids: torch.Size([1, 10])
Total Latency :15.702238082885742 sec, Throughput:6.368518899798906, Expert Loads saved:1097 
Total Latency :15.672339916229248 sec, Throughput:6.380668141101672, Expert Loads saved:2242 
Total Latency :15.60160517692566 sec, Throughput:6.409596888652023, Expert Loads saved:3354 
Total Latency :15.930583000183105 sec, Throughput:6.277234172713617, Expert Loads saved:4457 
Total Latency :15.83162808418274 sec, Throughput:6.316469757138196, Expert Loads saved:5561 
Total Latency :15.796133995056152 sec, Throughput:6.330662935076256, Expert Loads saved:6665 
Total Latency :16.01514220237732 sec, Throughput:6.244090669713554, Expert Loads saved:7790 
Total Latency :15.97008466720581 sec, Throughput:6.261707566607185, Expert Loads saved:8884 
Total Latency :15.865101099014282 sec, Throughput:6.303142940968281, Expert Loads saved:10050 
Total Latency :15.913127899169922 sec, Throughput:6.284119667335566, 

13it [3:34:53, 934.83s/it]

Total Latency :15.439211130142212 sec, Throughput:6.47701486540128, Expert Loads saved:55163 
Config: 12 Result: [{'latency': 15.702238082885742, 'throughput': 6.368518899798906, 'expert_loads_saved': 1097}, {'latency': 15.672339916229248, 'throughput': 6.380668141101672, 'expert_loads_saved': 2242}, {'latency': 15.60160517692566, 'throughput': 6.409596888652023, 'expert_loads_saved': 3354}, {'latency': 15.930583000183105, 'throughput': 6.277234172713617, 'expert_loads_saved': 4457}, {'latency': 15.83162808418274, 'throughput': 6.316469757138196, 'expert_loads_saved': 5561}, {'latency': 15.796133995056152, 'throughput': 6.330662935076256, 'expert_loads_saved': 6665}, {'latency': 16.01514220237732, 'throughput': 6.244090669713554, 'expert_loads_saved': 7790}, {'latency': 15.97008466720581, 'throughput': 6.261707566607185, 'expert_loads_saved': 8884}, {'latency': 15.865101099014282, 'throughput': 6.303142940968281, 'expert_loads_saved': 10050}, {'latency': 15.913127899169922, 'throughput

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What is the process of photosynthesis in plants?'] | Input_ids: torch.Size([1, 13])
Total Latency :16.370599508285522 sec, Throughput:6.108511783541451, Expert Loads saved:1543 
Total Latency :16.39268159866333 sec, Throughput:6.10028319028377, Expert Loads saved:3053 
Total Latency :16.447457790374756 sec, Throughput:6.079966963558415, Expert Loads saved:4594 
Total Latency :16.510268688201904 sec, Throughput:6.056836620197413, Expert Loads saved:6153 
Total Latency :16.466883420944214 sec, Throughput:6.072794556425297, Expert Loads saved:7672 
Total Latency :16.22523021697998 sec, Throughput:6.163240746830716, Expert Loads saved:9194 
Total Latency :16.185531854629517 sec, Throughput:6.178357368676593, Expert Loads saved:10729 
Total Latency :16.59160089492798 sec, Throughput:6.027145941689678, Expert Loads saved:12305 
Total Latency :16.524434566497803 sec, Throughput:6.051644284564107, Expert Loads saved:13820 
Total Latency :16.401602506637573 sec, Throughput:6.0969652178

14it [3:49:10, 911.23s/it]

Total Latency :16.659314393997192 sec, Throughput:6.002647986284042, Expert Loads saved:76840 
Config: 13 Result: [{'latency': 16.370599508285522, 'throughput': 6.108511783541451, 'expert_loads_saved': 1543}, {'latency': 16.39268159866333, 'throughput': 6.10028319028377, 'expert_loads_saved': 3053}, {'latency': 16.447457790374756, 'throughput': 6.079966963558415, 'expert_loads_saved': 4594}, {'latency': 16.510268688201904, 'throughput': 6.056836620197413, 'expert_loads_saved': 6153}, {'latency': 16.466883420944214, 'throughput': 6.072794556425297, 'expert_loads_saved': 7672}, {'latency': 16.22523021697998, 'throughput': 6.163240746830716, 'expert_loads_saved': 9194}, {'latency': 16.185531854629517, 'throughput': 6.178357368676593, 'expert_loads_saved': 10729}, {'latency': 16.59160089492798, 'throughput': 6.027145941689678, 'expert_loads_saved': 12305}, {'latency': 16.524434566497803, 'throughput': 6.051644284564107, 'expert_loads_saved': 13820}, {'latency': 16.401602506637573, 'through

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :18.099811553955078 sec, Throughput:5.524919400508814, Expert Loads saved:1942 
Total Latency :18.006612062454224 sec, Throughput:5.553515544909809, Expert Loads saved:3875 
Total Latency :18.207762718200684 sec, Throughput:5.492162960803464, Expert Loads saved:5848 
Total Latency :18.24242329597473 sec, Throughput:5.481727859152651, Expert Loads saved:7803 
Total Latency :17.691598892211914 sec, Throughput:5.652400362978011, Expert Loads saved:9754 
Total Latency :17.951806783676147 sec, Throughput:5.570469936816139, Expert Loads saved:11655 
Total Latency :18.005659818649292 sec, Throughput:5.553809247047164, Expert Loads saved:13588 
Total Latency :17.854589700698853 sec, Throughput:5.6008007843543925, Expert Loads saved:15569 
Total Latency :17.808734893798828 sec, Throughput:5.615222001806594, Expert Loads saved:17532 
Total Latency :18.073002099990845 sec, Throughput:5.5331

15it [4:04:25, 912.38s/it]

Total Latency :18.025336027145386 sec, Throughput:5.547746785380548, Expert Loads saved:97538 
Config: 14 Result: [{'latency': 18.099811553955078, 'throughput': 5.524919400508814, 'expert_loads_saved': 1942}, {'latency': 18.006612062454224, 'throughput': 5.553515544909809, 'expert_loads_saved': 3875}, {'latency': 18.207762718200684, 'throughput': 5.492162960803464, 'expert_loads_saved': 5848}, {'latency': 18.24242329597473, 'throughput': 5.481727859152651, 'expert_loads_saved': 7803}, {'latency': 17.691598892211914, 'throughput': 5.652400362978011, 'expert_loads_saved': 9754}, {'latency': 17.951806783676147, 'throughput': 5.570469936816139, 'expert_loads_saved': 11655}, {'latency': 18.005659818649292, 'throughput': 5.553809247047164, 'expert_loads_saved': 13588}, {'latency': 17.854589700698853, 'throughput': 5.6008007843543925, 'expert_loads_saved': 15569}, {'latency': 17.808734893798828, 'throughput': 5.615222001806594, 'expert_loads_saved': 17532}, {'latency': 18.073002099990845, 'th

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :19.481911182403564 sec, Throughput:5.13296663062102, Expert Loads saved:2264 
Total Latency :19.479918718338013 sec, Throughput:5.133491645725501, Expert Loads saved:4526 
Total Latency :19.44551706314087 sec, Throughput:5.142573461805795, Expert Loads saved:6771 
Total Latency :19.337899923324585 sec, Throughput:5.17119234231759, Expert Loads saved:9016 
Total Latency :19.45466899871826 sec, Throughput:5.140154273844923, Expert Loads saved:11261 
Total Latency :19.457120895385742 sec, Throughput:5.139506535302199, Expert Loads saved:13506 
Total Latency :19.401081562042236 sec, Throughput:5.154351816944457, Expert Loads saved:15751 
Total Latency :19.439672470092773 sec, Throughput:5.144119591204345, Expert Loads saved:17996 
Total Latency :19.313832759857178 sec, Throughput:5.17763621769807, Expert Loads saved:20241 
Total Latency :19.535656452178955 sec, Throughput:5.11884513

16it [4:20:51, 934.64s/it]

Total Latency :19.437612056732178 sec, Throughput:5.144664874889568, Expert Loads saved:112286 
Config: 15 Result: [{'latency': 19.481911182403564, 'throughput': 5.13296663062102, 'expert_loads_saved': 2264}, {'latency': 19.479918718338013, 'throughput': 5.133491645725501, 'expert_loads_saved': 4526}, {'latency': 19.44551706314087, 'throughput': 5.142573461805795, 'expert_loads_saved': 6771}, {'latency': 19.337899923324585, 'throughput': 5.17119234231759, 'expert_loads_saved': 9016}, {'latency': 19.45466899871826, 'throughput': 5.140154273844923, 'expert_loads_saved': 11261}, {'latency': 19.457120895385742, 'throughput': 5.139506535302199, 'expert_loads_saved': 13506}, {'latency': 19.401081562042236, 'throughput': 5.154351816944457, 'expert_loads_saved': 15751}, {'latency': 19.439672470092773, 'throughput': 5.144119591204345, 'expert_loads_saved': 17996}, {'latency': 19.313832759857178, 'throughput': 5.17763621769807, 'expert_loads_saved': 20241}, {'latency': 19.535656452178955, 'throu

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['Can you explain the theory of relativity in simple terms?'] | Input_ids: torch.Size([1, 13])
Total Latency :16.492210865020752 sec, Throughput:6.063468434792789, Expert Loads saved:776 
Total Latency :16.561957359313965 sec, Throughput:6.037933671152879, Expert Loads saved:1588 
Total Latency :16.555693864822388 sec, Throughput:6.040217994878514, Expert Loads saved:2397 
Total Latency :16.560113191604614 sec, Throughput:6.038606067662413, Expert Loads saved:3149 
Total Latency :16.514193296432495 sec, Throughput:6.055397209235928, Expert Loads saved:3926 
Total Latency :16.529194116592407 sec, Throughput:6.049901725070647, Expert Loads saved:4741 
Total Latency :16.6395206451416 sec, Throughput:6.009788510896674, Expert Loads saved:5536 
Total Latency :16.467782497406006 sec, Throughput:6.072463005614261, Expert Loads saved:6357 
Total Latency :16.473708629608154 sec, Throughput:6.070278541910731, Expert Loads saved:7166 
Total Latency :16.646726608276367 sec, Throughput:6.007

17it [4:35:06, 910.57s/it]

Total Latency :16.89674711227417 sec, Throughput:5.918298908986913, Expert Loads saved:39776 
Config: 16 Result: [{'latency': 16.492210865020752, 'throughput': 6.063468434792789, 'expert_loads_saved': 776}, {'latency': 16.561957359313965, 'throughput': 6.037933671152879, 'expert_loads_saved': 1588}, {'latency': 16.555693864822388, 'throughput': 6.040217994878514, 'expert_loads_saved': 2397}, {'latency': 16.560113191604614, 'throughput': 6.038606067662413, 'expert_loads_saved': 3149}, {'latency': 16.514193296432495, 'throughput': 6.055397209235928, 'expert_loads_saved': 3926}, {'latency': 16.529194116592407, 'throughput': 6.049901725070647, 'expert_loads_saved': 4741}, {'latency': 16.6395206451416, 'throughput': 6.009788510896674, 'expert_loads_saved': 5536}, {'latency': 16.467782497406006, 'throughput': 6.072463005614261, 'expert_loads_saved': 6357}, {'latency': 16.473708629608154, 'throughput': 6.070278541910731, 'expert_loads_saved': 7166}, {'latency': 16.646726608276367, 'throughput

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the health benefits of daily exercise?'] | Input_ids: torch.Size([1, 10])
Total Latency :17.502889156341553 sec, Throughput:5.713342472020885, Expert Loads saved:897 
Total Latency :17.369927644729614 sec, Throughput:5.75707637045581, Expert Loads saved:1824 
Total Latency :17.93077516555786 sec, Throughput:5.577003731109402, Expert Loads saved:2789 
Total Latency :17.599050521850586 sec, Throughput:5.68212471893539, Expert Loads saved:3653 
Total Latency :17.71922755241394 sec, Throughput:5.643586872181498, Expert Loads saved:4567 
Total Latency :17.697823524475098 sec, Throughput:5.650412315486456, Expert Loads saved:5512 
Total Latency :17.614876747131348 sec, Throughput:5.677019569057467, Expert Loads saved:6436 
Total Latency :17.459588527679443 sec, Throughput:5.727511839208906, Expert Loads saved:7336 
Total Latency :17.663245916366577 sec, Throughput:5.661473574760178, Expert Loads saved:8215 
Total Latency :17.56931471824646 sec, Throughput:5.691741630431713,

18it [4:50:07, 907.63s/it]

Total Latency :17.730583667755127 sec, Throughput:5.639972257758226, Expert Loads saved:46810 
Config: 17 Result: [{'latency': 17.502889156341553, 'throughput': 5.713342472020885, 'expert_loads_saved': 897}, {'latency': 17.369927644729614, 'throughput': 5.75707637045581, 'expert_loads_saved': 1824}, {'latency': 17.93077516555786, 'throughput': 5.577003731109402, 'expert_loads_saved': 2789}, {'latency': 17.599050521850586, 'throughput': 5.68212471893539, 'expert_loads_saved': 3653}, {'latency': 17.71922755241394, 'throughput': 5.643586872181498, 'expert_loads_saved': 4567}, {'latency': 17.697823524475098, 'throughput': 5.650412315486456, 'expert_loads_saved': 5512}, {'latency': 17.614876747131348, 'throughput': 5.677019569057467, 'expert_loads_saved': 6436}, {'latency': 17.459588527679443, 'throughput': 5.727511839208906, 'expert_loads_saved': 7336}, {'latency': 17.663245916366577, 'throughput': 5.661473574760178, 'expert_loads_saved': 8215}, {'latency': 17.56931471824646, 'throughput':

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['What are the main causes of climate change?'] | Input_ids: torch.Size([1, 10])
Total Latency :20.062532663345337 sec, Throughput:4.984415560987575, Expert Loads saved:1087 
Total Latency :19.76108431816101 sec, Throughput:5.060451055719503, Expert Loads saved:2153 
Total Latency :19.920937299728394 sec, Throughput:5.019844121559653, Expert Loads saved:3247 
Total Latency :19.56750988960266 sec, Throughput:5.110512301472541, Expert Loads saved:4314 
Total Latency :19.965257167816162 sec, Throughput:5.0087008226069445, Expert Loads saved:5319 
Total Latency :19.542032957077026 sec, Throughput:5.117174872217459, Expert Loads saved:6343 
Total Latency :19.754781246185303 sec, Throughput:5.062065671788203, Expert Loads saved:7326 
Total Latency :19.01836371421814 sec, Throughput:5.258075905091664, Expert Loads saved:8420 
Total Latency :19.419079065322876 sec, Throughput:5.149574790010122, Expert Loads saved:9533 
Total Latency :19.77348804473877 sec, Throughput:5.057276681470849, 

19it [5:06:50, 936.44s/it]

Total Latency :19.82347822189331 sec, Throughput:5.044523412120416, Expert Loads saved:53680 
Config: 18 Result: [{'latency': 20.062532663345337, 'throughput': 4.984415560987575, 'expert_loads_saved': 1087}, {'latency': 19.76108431816101, 'throughput': 5.060451055719503, 'expert_loads_saved': 2153}, {'latency': 19.920937299728394, 'throughput': 5.019844121559653, 'expert_loads_saved': 3247}, {'latency': 19.56750988960266, 'throughput': 5.110512301472541, 'expert_loads_saved': 4314}, {'latency': 19.965257167816162, 'throughput': 5.0087008226069445, 'expert_loads_saved': 5319}, {'latency': 19.542032957077026, 'throughput': 5.117174872217459, 'expert_loads_saved': 6343}, {'latency': 19.754781246185303, 'throughput': 5.062065671788203, 'expert_loads_saved': 7326}, {'latency': 19.01836371421814, 'throughput': 5.258075905091664, 'expert_loads_saved': 8420}, {'latency': 19.419079065322876, 'throughput': 5.149574790010122, 'expert_loads_saved': 9533}, {'latency': 19.77348804473877, 'throughput

Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Batch: ['How can one improve their time management skills?'] | Input_ids: torch.Size([1, 10])
Total Latency :21.873337268829346 sec, Throughput:4.571776074723871, Expert Loads saved:1039 
Total Latency :21.73957848548889 sec, Throughput:4.599905194424525, Expert Loads saved:2070 
Total Latency :21.85190224647522 sec, Throughput:4.576260632692987, Expert Loads saved:3101 
Total Latency :21.84433627128601 sec, Throughput:4.577845660224899, Expert Loads saved:4132 
Total Latency :21.860982179641724 sec, Throughput:4.574359888236224, Expert Loads saved:5163 
Total Latency :22.102371215820312 sec, Throughput:4.524401432929628, Expert Loads saved:6194 
Total Latency :21.810094356536865 sec, Throughput:4.585032891892476, Expert Loads saved:7225 
Total Latency :21.9184091091156 sec, Throughput:4.562374919738642, Expert Loads saved:8256 
Total Latency :21.91471266746521 sec, Throughput:4.563144473642173, Expert Loads saved:9287 
Total Latency :21.94744563102722 sec, Throughput:4.556338887047040

20it [5:25:05, 975.29s/it]

Total Latency :21.674078702926636 sec, Throughput:4.6138062600325, Expert Loads saved:51558 
Config: 19 Result: [{'latency': 21.873337268829346, 'throughput': 4.571776074723871, 'expert_loads_saved': 1039}, {'latency': 21.73957848548889, 'throughput': 4.599905194424525, 'expert_loads_saved': 2070}, {'latency': 21.85190224647522, 'throughput': 4.576260632692987, 'expert_loads_saved': 3101}, {'latency': 21.84433627128601, 'throughput': 4.577845660224899, 'expert_loads_saved': 4132}, {'latency': 21.860982179641724, 'throughput': 4.574359888236224, 'expert_loads_saved': 5163}, {'latency': 22.102371215820312, 'throughput': 4.524401432929628, 'expert_loads_saved': 6194}, {'latency': 21.810094356536865, 'throughput': 4.585032891892476, 'expert_loads_saved': 7225}, {'latency': 21.9184091091156, 'throughput': 4.562374919738642, 'expert_loads_saved': 8256}, {'latency': 21.91471266746521, 'throughput': 4.563144473642173, 'expert_loads_saved': 9287}, {'latency': 21.94744563102722, 'throughput': 4.




## Plotting Performance Gain

In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the saved profiling results from disk (presumably written by the
# profiling run earlier in this notebook -- confirm the producer).
# The context manager closes the file handle as soon as the load finishes;
# the original left it open until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only load a file that this notebook (or another trusted source) wrote.
with open("profiling_results.pkl", "rb") as results_file:
    profiling_results = pickle.load(results_file)

In [None]:
# Build one summary record per run configuration: the configuration's own
# fields plus the mean and (population) standard deviation of the latency,
# throughput and expert-loads-saved values measured for that configuration.
records = []

for config_num, cfg in enumerate(run_configs):
    # profiling_results is indexed by the configuration's position in run_configs;
    # each entry is a sequence of per-repetition measurement dicts.
    result = profiling_results[config_num]

    latencies = [r['latency'] for r in result]
    throughputs = [r['throughput'] for r in result]
    loads_saved = [r['expert_loads_saved'] for r in result]

    # Work on a copy: writing the statistics straight into cfg would mutate the
    # dicts held by run_configs, leaking summary columns into the shared
    # configuration objects that other cells read.
    record = dict(cfg)
    record['config_num'] = config_num
    record['mean_latency'] = np.mean(latencies)
    record['std_latency'] = np.std(latencies)
    record['mean_loads_saved'] = np.mean(loads_saved)
    record['std_loads_saved'] = np.std(loads_saved)
    record['mean_throughput'] = np.mean(throughputs)
    record['std_throughput'] = np.std(throughputs)

    records.append(record)

In [None]:
def _routing_label(row):
    """Return the plot label for one summary row.

    Rows whose routing strategy is 'THRESHOLDING' are labelled with the
    strategy name and their alpha value joined by a double underscore;
    every other row is labelled with the routing strategy alone.
    """
    strategy = row['routing_strategy']
    if strategy != 'THRESHOLDING':
        return strategy
    return f"{strategy}__{row['alpha']}"


# One row per run configuration, with a 'label' column used to group lines in the plots.
df = pd.DataFrame.from_records(records)
df['label'] = df.apply(_routing_label, axis=1)
df

In [None]:
# Split the summary frame by offload count: sub_dfs[n] holds the rows whose
# 'offload_per_layer' equals n, for each of the four benchmarked settings.
sub_dfs = {}
for offloads in (3, 4, 5, 6):
    has_this_offload = df['offload_per_layer'].eq(offloads)
    sub_dfs[offloads] = df.loc[has_this_offload]

In [None]:
import matplotlib.pyplot as plt

def _plot_metric_panel(ax, summary_df, labels, colors, mean_col, std_col,
                       ylabel, title, elinewidth=3, capsize=3, legend=False):
    """Draw one panel of mean +/- std against offloads per layer.

    One error-bar line is drawn per entry of ``labels``.

    Args:
        ax: Matplotlib axes to draw on.
        summary_df: Summary frame with 'label', 'offload_per_layer',
            ``mean_col`` and ``std_col`` columns.
        labels: Label values to plot, in colour order.
        colors: Colormap callable; ``colors(i)`` is the colour of ``labels[i]``.
        mean_col: Column plotted on the y axis.
        std_col: Column used as the error-bar half-height.
        ylabel: Y-axis label.
        title: Panel title.
        elinewidth: Error-bar line width (0 hides the bars).
        capsize: Error-bar cap size (0 hides the caps).
        legend: Whether to draw the legend on this panel.
    """
    for i, label in enumerate(labels):
        # Rows belonging to the current label only.
        sub_df = summary_df[summary_df['label'] == label]
        ax.errorbar(sub_df['offload_per_layer'], sub_df[mean_col],
                    yerr=sub_df[std_col], label=label,
                    color=colors(i), elinewidth=elinewidth, capsize=capsize)

    ax.set_xlabel('Offloads per Layer')
    ax.set_ylabel(ylabel)
    ax.set_xticks([3, 4, 5, 6])
    ax.set_title(title)
    if legend:
        ax.legend(title='Label', loc='upper left')


fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))  # 1 row, 3 columns

# Labels and colours are computed once and shared by all three panels, so the
# single legend on the first panel is valid for every panel.
labels = df['label'].unique()
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes the
# same (name, lut) arguments and returns the same resampled colormap.
colors = plt.get_cmap('tab10', len(labels))

_plot_metric_panel(ax1, df, labels, colors, 'mean_latency', 'std_latency',
                   ylabel='Latency', title='Latency per sequence trends',
                   legend=True)

# Throughput is a per-second rate (in the logged results it falls as latency
# rises), so this panel is titled tokens/second rather than seconds/token.
_plot_metric_panel(ax2, df, labels, colors, 'mean_throughput', 'std_throughput',
                   ylabel='Throughput', title='Generation tokens/second trends')

# Error bars are hidden on this panel (zero line width and cap size), as in
# the original figure.
_plot_metric_panel(ax3, df, labels, colors, 'mean_loads_saved', 'std_loads_saved',
                   ylabel='Expert loads saved', title='Avg expert loads saved ',
                   elinewidth=0, capsize=0)

plt.show()