In [1]:
from GenZ import get_model_df, get_summary_table, System, create_inference_moe_prefill_layer, create_inference_moe_decode_layer

import os
import pandas as pd

In [2]:
import os
import pandas as pd

def test_dense_LLM_prefill():
    """Regenerate the Llama-2 7B prefill result table for the TPU system.

    Removes any CSV already present at the output path, builds the TPU
    ``System``, evaluates the ``"llama2_7b"`` prefill model with
    ``get_model_df`` and writes the resulting frame to
    ``/tmp/current_llama2_7b_prefill_on_TPU.csv`` without the index column.
    Returns nothing and performs no assertions.
    """
    csv_path = '/tmp/current_llama2_7b_prefill_on_TPU.csv'

    # Remove the output of any previous run before regenerating it.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated against.
    tpu_system = System(
        flops=300,
        offchip_mem_bw=1200,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
    )

    # The "moe" prefill builder is used for this dense model as well.
    prefill_model = create_inference_moe_prefill_layer(1024, "llama2_7b")
    result_df = get_model_df(model=prefill_model, system=tpu_system)
    result_df.to_csv(csv_path, index=False)


def test_dense_LLM_decode():
    """Regenerate the Llama-2 7B decode result table for the TPU system and
    check that it survives a CSV round trip.

    Removes any CSV already present at the output path, builds the TPU
    ``System``, evaluates the ``"llama2_7b"`` decode model with
    ``get_model_df`` and writes the frame to
    ``/tmp/current_llama2_7b_decode_on_TPU.csv`` without the index column.
    The file is then read back and its shape compared with the in-memory
    frame, so the reload is no longer a dead store.

    Raises:
        AssertionError: if the reloaded CSV does not have the same number of
            rows and columns as the frame that was written.
    """
    csv_path = '/tmp/current_llama2_7b_decode_on_TPU.csv'

    # Remove the output of any previous run before regenerating it.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Generate the current result
    TPU = System(flops=300, offchip_mem_bw=1200, compute_efficiency=0.8, memory_efficiency=0.8, bits='bf16')

    # Save the current result to a CSV file
    current_df = get_model_df(model=create_inference_moe_decode_layer(1024, "llama2_7b"), system=TPU)
    current_df.to_csv(csv_path, index=False)

    # Reload the saved result and verify the file holds the full table.
    # Only the shape is compared: dtypes and float formatting may legitimately
    # change when going through CSV.
    reloaded_current_df = pd.read_csv(csv_path)
    assert reloaded_current_df.shape == current_df.shape, (
        f"CSV round trip changed shape: wrote {current_df.shape}, "
        f"read back {reloaded_current_df.shape}"
    )


def test_moe_LLM_prefill():
    """Regenerate the Mixtral 8x7B prefill result table for the GH200 system.

    Removes any CSV already present at the output path, builds the GH200
    ``System``, evaluates the ``"mixtral_8x7b"`` prefill model with
    ``get_model_df`` and writes the resulting frame to
    ``/tmp/current_mixtral_8x7b_prefill_on_GH200.csv`` without the index
    column. Returns nothing and performs no assertions.
    """
    csv_path = '/tmp/current_mixtral_8x7b_prefill_on_GH200.csv'

    # Remove the output of any previous run before regenerating it.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated against.
    gh200_system = System(
        flops=2000,
        offchip_mem_bw=4900,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
        off_chip_mem_size=144,
    )

    prefill_model = create_inference_moe_prefill_layer(1024, "mixtral_8x7b")
    result_df = get_model_df(model=prefill_model, system=gh200_system)
    result_df.to_csv(csv_path, index=False)



def test_moe_LLM_decode():
    """Regenerate the Mixtral 8x7B decode result table for the GH200 system.

    Removes any CSV already present at the output path, builds the GH200
    ``System``, evaluates the ``"mixtral_8x7b"`` decode model with
    ``get_model_df`` and writes the resulting frame to
    ``/tmp/current_mixtral_8x7b_decode_on_GH200.csv`` without the index
    column. Returns nothing and performs no assertions.
    """
    csv_path = '/tmp/current_mixtral_8x7b_decode_on_GH200.csv'

    # Remove the output of any previous run before regenerating it.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated against.
    gh200_system = System(
        flops=2000,
        offchip_mem_bw=4900,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
        off_chip_mem_size=144,
    )

    decode_model = create_inference_moe_decode_layer(1024, "mixtral_8x7b")
    result_df = get_model_df(model=decode_model, system=gh200_system)
    result_df.to_csv(csv_path, index=False)


In [3]:
# Run all four generators; each one rewrites its own CSV under /tmp.
# None of them returns a value, so this cell produces no output.
test_dense_LLM_prefill()
test_dense_LLM_decode()
test_moe_LLM_prefill()
test_moe_LLM_decode()

In [4]:
import numpy as np
# Show the numpy and pandas versions this notebook was executed with.
np.__version__, pd.__version__

('1.26.4', '2.2.2')

In [5]:
from GenZ.Models.get_language_model import get_configs, create_inference_moe_prefill_layer, create_inference_moe_decode_layer

# Directory the generated model CSV is read from in the next cell
# (joined with the file name returned by create_inference_moe_prefill_layer).
MODEL_PATH = "/tmp/genz/data/model"

In [6]:
# Build the GPT-2 prefill layer description for a 10-token input; the call
# returns a file name that is checked and then loaded below.
file_name = create_inference_moe_prefill_layer(input_sequence_length=10, name='gpt-2')
# The returned name must be a CSV and must contain the 'gpt-2_prefix' tag.
assert file_name.endswith('.csv')
assert 'gpt-2_prefix' in file_name
# header=None: the file's first row is kept as a data row instead of being
# used for column names, so the columns are numbered.
df = pd.read_csv(os.path.join(MODEL_PATH, file_name), header=None)

In [7]:
# Display the GPT-2 prefill layer table loaded in the previous cell.
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,Name,M,N,D,H,Z,Z,T
1,QKV,2304,10,768,1,1,0,3
2,Logit,12,10,10,64,12,3,4
3,Attend,12,10,10,64,12,1,5
4,Out Proj,768,10,768,1,1,0,3
5,up+gate,3072,10,768,1,1,0,3
6,down,768,10,3072,1,1,0,3


In [8]:
from GenZ import prefill_moddeling, get_model_df, get_configs, System, create_inference_moe_prefill_layer, get_AR_time

# TPU system description, this time with inter-chip link parameters because the
# model below is run with tensor_parallel=4.
TPU = System(flops=300, offchip_mem_bw=1200, compute_efficiency=0.8, memory_efficiency=0.8, bits='bf16',
            interchip_link_bw=50, interchip_link_latency=1)
Model = 'gpt-2'
# Result table from get_model_df for the GPT-2 prefill model (4096 passed as the
# first argument, 4-way tensor parallel). It is kept in memory only; nothing is
# written to a CSV in this cell.
current_df = get_model_df(model=create_inference_moe_prefill_layer(4096, Model, tensor_parallel=4), system=TPU)

## For GPT-2, the AR message size is 6 MiB:
## 4096 tokens * 768 hidden dim * 2 bytes (bf16) = 6*2**20 bytes
# NOTE(review): the output recorded for this cell is a TypeError rejecting the
# `num_AR_nodes` keyword, yet the identical call in a later cell returns a value.
# The notebook was presumably run against two different GenZ versions -- confirm
# the keyword against the installed get_AR_time signature and re-run from a
# fresh kernel.
AR_time = get_AR_time(data = 6*2**20, num_AR_nodes = 4, system = TPU)

# End-to-end prefill estimate for the same configuration (batch 1, 4096 input
# tokens, tensor parallel 4, no pipeline parallelism).
prefill_output = prefill_moddeling(model = Model, batch_size = 1, input_tokens = 4096,
                            system_name = TPU, bits='bf16', tensor_parallel = 4, pipeline_parallel = 1, debug=True)

TypeError: get_AR_time() got an unexpected keyword argument 'num_AR_nodes'

In [9]:
# Display the dictionary returned by prefill_moddeling
# (Latency, Throughput, Runtime_breakdown, is_offload).
prefill_output

{'Latency': 5.886358809914062,
 'Throughput': 169.88430917866518,
 'Runtime_breakdown': [0.7593637155140625, 0.6442450944, 4.482749999999999],
 'is_offload': False}

In [10]:
# Evaluate the all-reduce time for the 6*2**20-byte message across 4 nodes on
# its own, so that the value is displayed.
get_AR_time(data = 6*2**20, num_AR_nodes = 4, system = TPU)

0.18678124999999998

In [11]:
# NOTE(review): System was constructed with interchip_link_latency=1 but the
# attribute reads back as 1e-06 -- presumably the constructor rescales the
# argument (e.g. microseconds to seconds); confirm against System.__init__.
TPU.interchip_link_latency

1e-06

In [12]:
import numpy as np
# First list: Latency, Throughput and Runtime_breakdown[0] copied from the
# prefill_output displayed above. Second list: values to compare against
# (NOTE(review): their origin is not recorded in this notebook -- document it).
# np.isclose is called with its default tolerances (rtol=1e-05, atol=1e-08);
# the pairs differ by roughly 0.04%-0.3%, so every comparison is False.
np.isclose([5.886358809914062, 169.88430917866518, 0.7593637155140625],
            [5.888638473, 169.8185420255, 0.7616433786,])

array([False, False, False])