In [1]:
from GenZ import get_model_df, get_summary_table, System, create_inference_moe_prefill_layer, create_inference_moe_decode_layer

import os
import pandas as pd

In [2]:
import os
import pandas as pd

def test_dense_LLM_prefill():
    """Regenerate the llama2_7b prefill result CSV for the TPU system.

    Removes any existing copy of the output file, builds the TPU ``System``,
    evaluates the prefill layer description for ``"llama2_7b"`` (built with
    1024 as its first argument) through ``get_model_df`` and writes the
    resulting frame to ``/tmp`` without the index column.
    """
    csv_path = '/tmp/current_llama2_7b_prefill_on_TPU.csv'

    # Drop a leftover file from an earlier run before producing a new one.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated on.
    tpu_system = System(
        flops=300,
        offchip_mem_bw=1200,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
    )

    # Build the layer description first, then evaluate it on the system.
    prefill_layer = create_inference_moe_prefill_layer(1024, "llama2_7b")
    result_df = get_model_df(model=prefill_layer, system=tpu_system)
    result_df.to_csv(csv_path, index=False)


def test_dense_LLM_decode():
    """Regenerate the llama2_7b decode result CSV for the TPU system.

    Removes any existing copy of the output file, builds the TPU ``System``,
    evaluates the decode layer description for ``"llama2_7b"`` (built with
    1024 as its first argument) through ``get_model_df``, writes the frame to
    ``/tmp`` without the index column and finally reads the file back.
    """
    csv_path = '/tmp/current_llama2_7b_decode_on_TPU.csv'

    # Drop a leftover file from an earlier run before producing a new one.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated on.
    tpu_system = System(
        flops=300,
        offchip_mem_bw=1200,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
    )

    # Build the layer description first, then evaluate it on the system.
    decode_layer = create_inference_moe_decode_layer(1024, "llama2_7b")
    result_df = get_model_df(model=decode_layer, system=tpu_system)
    result_df.to_csv(csv_path, index=False)

    # Read the file back; the loaded frame is not inspected further, so this
    # only exercises that the written CSV can be parsed.
    roundtrip_df = pd.read_csv(csv_path)


def test_moe_LLM_prefill():
    """Regenerate the mixtral_8x7b prefill result CSV for the GH200 system.

    Removes any existing copy of the output file, builds the GH200 ``System``,
    evaluates the prefill layer description for ``"mixtral_8x7b"`` (built with
    1024 as its first argument) through ``get_model_df`` and writes the
    resulting frame to ``/tmp`` without the index column.
    """
    csv_path = '/tmp/current_mixtral_8x7b_prefill_on_GH200.csv'

    # Drop a leftover file from an earlier run before producing a new one.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated on.
    gh200_system = System(
        flops=2000,
        offchip_mem_bw=4900,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
        off_chip_mem_size=144,
    )

    # Build the layer description first, then evaluate it on the system.
    prefill_layer = create_inference_moe_prefill_layer(1024, "mixtral_8x7b")
    result_df = get_model_df(model=prefill_layer, system=gh200_system)
    result_df.to_csv(csv_path, index=False)



def test_moe_LLM_decode():
    """Regenerate the mixtral_8x7b decode result CSV for the GH200 system.

    Removes any existing copy of the output file, builds the GH200 ``System``,
    evaluates the decode layer description for ``"mixtral_8x7b"`` (built with
    1024 as its first argument) through ``get_model_df`` and writes the
    resulting frame to ``/tmp`` without the index column.
    """
    csv_path = '/tmp/current_mixtral_8x7b_decode_on_GH200.csv'

    # Drop a leftover file from an earlier run before producing a new one.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    # Hardware description the model is evaluated on.
    gh200_system = System(
        flops=2000,
        offchip_mem_bw=4900,
        compute_efficiency=0.8,
        memory_efficiency=0.8,
        bits='bf16',
        off_chip_mem_size=144,
    )

    # Build the layer description first, then evaluate it on the system.
    decode_layer = create_inference_moe_decode_layer(1024, "mixtral_8x7b")
    result_df = get_model_df(model=decode_layer, system=gh200_system)
    result_df.to_csv(csv_path, index=False)


In [3]:
# Run the four generators defined above; each one writes its result CSV
# under /tmp (dense llama2_7b on TPU, MoE mixtral_8x7b on GH200).
test_dense_LLM_prefill()
test_dense_LLM_decode()
test_moe_LLM_prefill()
test_moe_LLM_decode()

In [4]:
# Record the numpy and pandas versions this notebook was executed with.
import numpy as np
np.__version__, pd.__version__

('1.26.4', '2.2.2')

In [5]:
from GenZ.Models.get_language_model import get_configs, create_inference_moe_prefill_layer, create_inference_moe_decode_layer

# Directory that the CSV file name returned by the layer builder is joined
# onto when the file is read back with pandas.
MODEL_PATH = "/tmp/genz/data/model"

In [6]:
# Build the gpt-2 prefill layer description for a 10-token input; the call
# returns the name of the CSV file it produced.
file_name = create_inference_moe_prefill_layer(input_sequence_length=10, name='gpt-2')

# Sanity-check the returned name; include it in the message so a failing
# assertion shows what was actually returned.
assert file_name.endswith('.csv'), f"expected a .csv file name, got {file_name!r}"
assert 'gpt-2_prefix' in file_name, f"expected 'gpt-2_prefix' in file name, got {file_name!r}"

# The file is read with header=None, so its first row is kept as data row 0.
df = pd.read_csv(os.path.join(MODEL_PATH, file_name), header=None)

In [7]:
# Display the parsed layer table; because it was read with header=None, the
# M/N/D/H/Z/Z/T header row shows up as data row 0.
df

Unnamed: 0,0,1,2,3,4,5,6
0,M,N,D,H,Z,Z,T
1,2304,10,768,1,1,0,3
2,12,10,10,64,12,3,4
3,12,10,10,64,12,1,5
4,768,10,768,1,1,0,3
5,3072,10,768,1,1,0,3
6,768,10,3072,1,1,0,3


In [8]:
from GenZ import prefill_moddeling, get_model_df, get_configs, System, create_inference_moe_prefill_layer, get_AR_time

# TPU system description, this time including inter-chip link parameters
# (interchip_mem_bw, interchip_link_latency) for the tensor-parallel run.
TPU = System(flops=300, offchip_mem_bw=1200, compute_efficiency=0.8, memory_efficiency=0.8, bits='bf16',
            interchip_mem_bw=50, interchip_link_latency=1)
Model = 'gpt-2'
# Per-operator table for the gpt-2 prefill layer (4096, tensor_parallel=4) on this system.
# NOTE(review): nothing is written to CSV here, and current_df is not used again in the visible cells.
current_df = get_model_df(model=create_inference_moe_prefill_layer(4096, Model, tensor_parallel=4), system=TPU)

## For GPT-2, the AR message size is 6 MiB: 4096 tokens * 768 * 2 bytes (bf16),
## which matches the Sync shape (1, 4096, 768) reported in the debug table.
AR_time = get_AR_time(data = 6*2**20, num_AR_nodes = 4, system = TPU)

# Full prefill estimate for the same configuration; the returned dict holds
# 'Latency', 'Throughput', 'Runtime_breakdown' and 'is_offload'.
# NOTE(review): debug=True presumably produces the tables shown as this cell's output — confirm.
prefill_output = prefill_moddeling(model = Model, batch_size = 1, input_tokens = 4096,
                            system_name = TPU, bits='bf16', tensor_parallel = 4, pipeline_parallel = 1, debug=True)

Unnamed: 0,Op Type,Dimension,Op Intensity,Num ops (MFLOP),Input_a (MB),Input_w (MB),Output (MB),Total Data (MB),Compute time (msec),Memory time (msec),Communication time (msec),Bound,C/M ratio,Cycles,% of total time,Throughput (Tflops),Compute cycle,Memory cycle,Latency (msec),C Effcy,Communication cycle
0,Repeat,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Collective,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0
1,GEMM,"[((1, 768, 4096), (576, 768), (1, 576, 4096))]",304.661157,3623.878656,6.0,0.84375,4.5,11.34375,0.015099,0.011539,0.0,Compute,1.30851,14193.524736,3.078201,240.0,14193.524736,10847.091675,0.015099,0.8,0.0
2,Logit,"((1, 3, 4096, 64), (1, 3, 4096, 64), (1, 3, 4096, 4096))",62.060606,6442.450944,1.5,1.5,96.0,99.0,0.026844,0.009562,0.0,Compute,2.807264,25232.932864,5.472357,240.0,25232.932864,8988.44401,0.026844,0.8,0.0
3,Attend,"((1, 3, 4096, 4096), (1, 3, 4096, 64), (1, 3, 4096, 64))",62.060606,6442.450944,96.0,1.5,1.5,99.0,0.026844,0.009562,0.0,Compute,2.807264,25232.932864,5.472357,240.0,25232.932864,8988.44401,0.026844,0.8,0.0
4,GEMM,"[((1, 192, 4096), (768, 192), (1, 768, 4096))]",148.048193,1207.959552,1.5,0.28125,6.0,7.78125,0.005033,0.007915,0.0,Memory,0.635862,7440.567017,1.613662,152.606915,4731.174912,7440.567017,0.007915,0.8,0.0
5,Sync,"(1, 4096, 768)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186781,Collective,0.0,175574.375,38.077444,0.0,0.0,0.0,0.186781,0.8,175574.375
6,GEMM,"[((1, 768, 4096), (768, 768), (1, 768, 4096))]",351.085714,4831.838208,6.0,1.125,6.0,13.125,0.020133,0.013351,0.0,Compute,1.507902,18924.699648,4.104267,240.0,18924.699648,12550.354004,0.020133,0.8,0.0
7,GEMM,"[((1, 768, 4096), (768, 768), (1, 768, 4096))]",351.085714,4831.838208,6.0,1.125,6.0,13.125,0.020133,0.013351,0.0,Compute,1.507902,18924.699648,4.104267,240.0,18924.699648,12550.354004,0.020133,0.8,0.0
8,Sync,"(1, 4096, 768)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186781,Collective,0.0,175574.375,38.077444,0.0,0.0,0.0,0.186781,0.8,175574.375
9,EndRepeat,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Collective,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0


Unnamed: 0,MACs (MFLOP),Total Data (MB),Total Weights (MB),Unused Weights (MB),KV Cache (MB),On-chip Memory Footprint (MB),Latency (msec),Cycles,Attn Latency (msec),Linear Latency (msec),Comm Latency (msec)
0,328564.99814,2920.5,40.5,0.0,36.0,99.0,5.88636,5533177.28132,0.64425,0.75936,4.48275


In [9]:
# Inspect the summary dict returned by prefill_moddeling.
prefill_output

{'Latency': 5.886358809914062,
 'Throughput': 169.88430917866518,
 'Runtime_breakdown': [0.7593637155140625, 0.6442450944, 4.482749999999999],
 'is_offload': False}

In [10]:
# Same AR-time query as in the tensor-parallel cell (6*2**20 bytes, 4 nodes);
# the displayed value agrees with the per-Sync "Communication time (msec)"
# of 0.186781 in the debug table.
get_AR_time(data = 6*2**20, num_AR_nodes = 4, system = TPU)

0.18678124999999998

In [11]:
# NOTE(review): System was constructed with interchip_link_latency=1 but the
# attribute reads back as 1e-06 — presumably System rescales the value
# (e.g. microseconds to seconds); confirm against the System implementation.
TPU.interchip_link_latency

1e-06

In [12]:
import numpy as np
# Compare the computed prefill metrics (latency, throughput and the first
# runtime-breakdown entry) against previously recorded reference numbers.
# The values are read from prefill_output instead of being pasted in by hand,
# so the check cannot go stale when the cells above are re-run.
# np.isclose uses its default tolerances (rtol=1e-05, atol=1e-08).
# NOTE(review): the origin of the reference numbers is not shown in this
# notebook, and the recorded output of this check is all False — confirm
# whether the references or the model changed.
np.isclose(
    [
        prefill_output['Latency'],
        prefill_output['Throughput'],
        prefill_output['Runtime_breakdown'][0],
    ],
    [5.888638473, 169.8185420255, 0.7616433786],
)

array([False, False, False])