In [1]:
# Show the visible GPU(s), driver/CUDA versions and current memory use before loading the model.
!nvidia-smi

Tue Apr 16 17:49:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          Off | 00000000:BE:00.0 Off |                    0 |
| N/A   29C    P0              73W / 700W |      4MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
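# One-time setup: download and unpack the MMLU data (uncomment to run).
# !wget https://people.eecs.berkeley.edu/~hendrycks/data.tar -O mmlu.tar
# `tar -C` requires the target directory to exist, so create it first.
# !mkdir -p mmlu
# !tar -xf mmlu.tar -C mmlu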

In [3]:
# Reload imported modules (e.g. get_decode_model_characterstics) every time a cell is executed, so you don't need to restart the kernel after editing the source code.
%load_ext autoreload
%autoreload 2  

In [4]:

import os
import sys

# Put the parent of the current working directory at the front of the module
# search path (only if it is not already listed), so packages living there
# can be imported from this notebook.
script_dir = os.getcwd()
module_path = os.path.abspath(os.path.join(script_dir, '../'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

# Add the relative "mixtral-offloading" directory to the end of the search path.
sys.path.append("mixtral-offloading")
import torch
from torch.nn import functional as F
from hqq.core.quantize import BaseQuantizeConfig
from huggingface_hub import snapshot_download
from IPython.display import clear_output
from tqdm.auto import trange
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import logging as hf_logging
import time
import gc
from src.build_model import OffloadConfig, QuantConfig, build_model

[36mhqq_aten package not installed. HQQBackend.ATEN backend will not work unless you install the hqq_aten lib in hqq/kernels.[0m


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Identifiers used throughout the notebook:
#   model_name            - repo id the tokenizer is loaded from
#   quantized_model_name  - repo id whose config is read below
#   state_path            - passed to build_model as `state_path`
#                           (presumably a local checkpoint directory; confirm)
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quantized_model_name = "lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo"
state_path = "Mixtral-8x7B-Instruct-v0.1-offloading-demo2"

config = AutoConfig.from_pretrained(quantized_model_name)

device = torch.device("cuda:0")

##### Change this to 5 if you have only 12 GB of GPU VRAM #####
# NOTE(review): the hint above was presumably written for a smaller default;
# with 6 experts offloaded per layer, switching to 5 would offload *fewer*
# experts. Confirm before following it.
offload_per_layer = 6
# offload_per_layer = 5
###############################################################

num_experts = config.num_local_experts

# Pool sizes are totals across all hidden layers: per layer,
# `offload_per_layer` experts go to the offload pool and the remaining
# `num_experts - offload_per_layer` to the main pool.
offload_config = OffloadConfig(
    main_size=config.num_hidden_layers * (num_experts - offload_per_layer),
    offload_size=config.num_hidden_layers * offload_per_layer,
    buffer_size=4,
    offload_per_layer=offload_per_layer,
)


# HQQ quantization settings for the attention weights: 4-bit, groups of 64,
# with zero-points and scales themselves quantized.
attn_config = BaseQuantizeConfig(
    nbits=4,
    group_size=64,
    quant_zero=True,
    quant_scale=True,
)
# Override the group size used when quantizing the attention scales.
attn_config["scale_quant_params"]["group_size"] = 256


# HQQ quantization settings for the expert (FFN) weights: 2-bit, groups of 16.
ffn_config = BaseQuantizeConfig(
    nbits=2,
    group_size=16,
    quant_zero=True,
    quant_scale=True,
)
quant_config = QuantConfig(ffn_config=ffn_config, attn_config=attn_config)


# Uncomment when re-running this cell with a model already loaded, so its
# memory can be reclaimed before a new one is built.
# del model

# Run a garbage-collection pass, then release PyTorch's cached CUDA blocks.
# The original `gc.collect` lacked the call parentheses and therefore never
# actually collected anything.
gc.collect()
torch.cuda.empty_cache()


In [6]:

# Expert-routing options for this run; their exact semantics are defined by
# build_model in src.build_model (not visible in this notebook).
routing_kwargs = {
    "routing_strategy": "THRESHOLDING",
    "routing_threshold": 0.15,
}

# Build the model on `device` from the weights at `state_path`, using the
# quantization and offloading configs prepared in the configuration cell.
model = build_model(
    state_path=state_path,
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    **routing_kwargs,
)

Loading experts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00,  6.55it/s]


In [7]:
from mmlu import test_mmlu

# The tokenizer is loaded from the repo named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run the MMLU evaluation on the already-loaded model, reading the benchmark
# data from ./mmlu. `save_dir` is presumably where per-subject results are
# written (confirm against mmlu.test_mmlu).
test_mmlu(
    model_name=model_name,
    model_loaded=model,
    tokenizer=tokenizer,
    data_dir="./mmlu",
    save_dir="./thresholding_0_15_6E_Off_results",
)

Starting abstract_algebra, dev size:(5, 6), Test size:(100, 6)
Average accuracy 0.340 , Average Time:1.310 sec, avg expert load reduced: 1132.55, - abstract_algebra
Starting anatomy, dev size:(5, 6), Test size:(135, 6)
Average accuracy 0.593 , Average Time:1.307 sec, avg expert load reduced: 3639.5703703703703, - anatomy
Starting astronomy, dev size:(5, 6), Test size:(152, 6)
Average accuracy 0.789 , Average Time:1.552 sec, avg expert load reduced: 6559.664473684211, - astronomy
Starting business_ethics, dev size:(5, 6), Test size:(100, 6)
Average accuracy 0.670 , Average Time:1.531 sec, avg expert load reduced: 9094.06, - business_ethics
Starting clinical_knowledge, dev size:(5, 6), Test size:(265, 6)
Average accuracy 0.758 , Average Time:1.383 sec, avg expert load reduced: 12676.781132075472, - clinical_knowledge
Starting college_biology, dev size:(5, 6), Test size:(144, 6)
Average accuracy 0.764 , Average Time:1.444 sec, avg expert load reduced: 16781.729166666668, - college_biology

In [None]:
# Result Ref: https://wandb.ai/byyoung3/ml-news/reports/Testing-Mixtral-8x7B-with-MMLU-and-W-B---Vmlldzo2MjI0ODAz

In [6]:

# Expert-routing options for this run; their exact semantics are defined by
# build_model in src.build_model (not visible in this notebook).
routing_kwargs = {
    "routing_strategy": "THRESHOLDING",
    "routing_threshold": 0.075,
}

# Build the model on `device` from the weights at `state_path`, using the
# quantization and offloading configs prepared in the configuration cell.
model = build_model(
    state_path=state_path,
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    **routing_kwargs,
)

Loading experts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00,  6.54it/s]


In [7]:
from mmlu import test_mmlu

# The tokenizer is loaded from the repo named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run the MMLU evaluation on the already-loaded model, reading the benchmark
# data from ./mmlu. `save_dir` is presumably where per-subject results are
# written (confirm against mmlu.test_mmlu).
test_mmlu(
    model_name=model_name,
    model_loaded=model,
    tokenizer=tokenizer,
    data_dir="./mmlu",
    save_dir="./thresholding_0_075_results",
)

Starting abstract_algebra, dev size:(5, 6), Test size:(100, 6)
Average accuracy 0.340 , Average Time:1.154 sec, avg expert load reduced: 599.33, - abstract_algebra
Starting anatomy, dev size:(5, 6), Test size:(135, 6)
Average accuracy 0.593 , Average Time:1.153 sec, avg expert load reduced: 1882.8592592592593, - anatomy
Starting astronomy, dev size:(5, 6), Test size:(152, 6)
Average accuracy 0.789 , Average Time:1.394 sec, avg expert load reduced: 3385.125, - astronomy
Starting business_ethics, dev size:(5, 6), Test size:(100, 6)
Average accuracy 0.670 , Average Time:1.376 sec, avg expert load reduced: 4655.49, - business_ethics
Starting clinical_knowledge, dev size:(5, 6), Test size:(265, 6)
Average accuracy 0.758 , Average Time:1.227 sec, avg expert load reduced: 6464.0037735849055, - clinical_knowledge
Starting college_biology, dev size:(5, 6), Test size:(144, 6)
Average accuracy 0.764 , Average Time:1.286 sec, avg expert load reduced: 8545.583333333334, - college_biology
Starting c