[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhibambhaniya/GenZ-LLM-Analyzer/blob/main/notebook/GenZ_on_colab.ipynb)

In [None]:
## Run only once
!git clone https://github.com/abhibambhaniya/GenZ-LLM-Analyzer.git
!pip install seaborn
!pip install plotnine
!pip install tqdm
!pip install plotly
!pip install paretoset

In [2]:
# Import necessary libraries

import os, sys, warnings
script_dir = os.getcwd()
module_path = script_dir
# Locate the cloned GenZ checkout. '/content' is the Colab location; when that
# directory does not exist (notebook run elsewhere), fall back to a clone
# inside the current working directory so the GenZ/Systems imports below
# still resolve.
genz_repo_dir = '/content/GenZ-LLM-Analyzer'
if not os.path.isdir(genz_repo_dir):
    genz_repo_dir = os.path.join(script_dir, 'GenZ-LLM-Analyzer')
sys.path.insert(0, genz_repo_dir)

from GenZ import decode_moddeling, prefill_moddeling
from Systems.system_configs import *
import pandas as pd
from plotnine import *
import plotnine as p9
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px
from plotnine import *
import plotnine as p9


In [3]:
# Sweep configuration: every (model, batch size, prompt length) combination
# below is run through the prefill and decode models.
batch_size_list = [1, 2, 4, 8]
input_token_list = [100, 250, 500]
# NOTE: the widget cell further down rebinds `model_box` to a SelectMultiple
# widget; in this cell it is a plain list of model names.
model_box = ['llama2_7b', 'llama_70b']

# Tokens generated per request. Used both as the decode model's output length
# and as the multiplier in the end-to-end latency, so the two cannot drift apart.
OUTPUT_TOKENS = 100

# (tensor_parallel, pipeline_parallel) used for each model in the sweep.
PARALLELISM = {
    'llama2_7b': (1, 1),
    'llama_70b': (2, 1),
}

data = []
for model in model_box:
    # A dict lookup raises KeyError for a model with no entry, instead of
    # silently reusing the tp/pp left over from the previous model.
    tp, pp = PARALLELISM[model]
    for batch_size in tqdm(batch_size_list):
        for input_token in input_token_list:
            # Arguments shared by the prefill and decode calls.
            common_args = dict(
                model=model, batch_size=batch_size,
                input_tokens=input_token,
                system_name=H100_GPU, system_eff=1,
                bits='bf16',
                tensor_parallel=tp, pipeline_parallel=pp, debug=False,
            )
            prefill_outputs = prefill_moddeling(**common_args)
            # Divided by 1000 to get seconds per the original '## Secs' note;
            # presumably GenZ reports 'Latency' in milliseconds -- TODO confirm.
            TTFT = prefill_outputs['Latency'] / 1000  ## Secs
            decode_outputs = decode_moddeling(Bb=1, output_tokens=OUTPUT_TOKENS, **common_args)
            TPOT = decode_outputs['Latency'] / 1000  ## Secs
            # End-to-end latency: first-token time plus one TPOT per generated token.
            Latency = TTFT + TPOT * OUTPUT_TOKENS
            data.append([model, tp, pp, batch_size, input_token, TTFT, TPOT, Latency, decode_outputs['Throughput']])

assert len(data) > 0, "No Model fits in the given # of GPUs. Increase GPUs or use different Model"
data_df = pd.DataFrame(data, columns = ['Model','TP', 'PP', 'Batch', 'Input Size', 'TTFT(sec)', 'TPOT(sec)', 'Latency(sec)', 'Decode Tokens/s'])
data_df

100%|██████████| 4/4 [00:00<00:00, 23.85it/s]
100%|██████████| 4/4 [00:00<00:00, 30.94it/s]


Unnamed: 0,Model,TP,PP,Batch,Input Size,TTFT(sec),TPOT(sec),Latency(sec),Decode Tokens/s,Prefill GEMM Time(ms),Prefill Attn Time(ms),Prefill Communication Time(ms),Decode GEMM Time(ms),Decode Attn Time(ms),Decode Communication Time(ms)
0,llama2_7b,1,1,1,100,0.003701,0.003571,0.360782,280.048635,3.670313,0.030842,0.0,3.549019,0.029052,0.0
1,llama2_7b,1,1,1,250,0.003939,0.003592,0.363177,278.366837,3.854092,0.085052,0.0,3.549019,0.050626,0.0
2,llama2_7b,1,1,1,500,0.006745,0.003628,0.369578,275.608284,6.548034,0.196594,0.0,3.549019,0.086582,0.0
3,llama2_7b,1,1,2,100,0.003855,0.003594,0.363237,556.510504,3.792832,0.061683,0.0,3.550244,0.058104,0.0
4,llama2_7b,1,1,2,250,0.006718,0.003637,0.370415,549.908337,6.548034,0.170103,0.0,3.550244,0.101251,0.0
5,llama2_7b,1,1,2,500,0.013489,0.003709,0.384377,539.246087,13.096067,0.393188,0.0,3.550244,0.173163,0.0
6,llama2_7b,1,1,4,100,0.005362,0.00364,0.369347,1098.946086,5.238427,0.123367,0.0,3.552695,0.116208,0.0
7,llama2_7b,1,1,4,250,0.013436,0.003726,0.386051,1073.495407,13.096067,0.340206,0.0,3.552695,0.202503,0.0
8,llama2_7b,1,1,4,500,0.026979,0.00387,0.413975,1033.599881,26.192135,0.786376,0.0,3.552695,0.346327,0.0
9,llama2_7b,1,1,8,100,0.010724,0.003732,0.383914,2143.675646,10.476854,0.246734,0.0,3.557596,0.232416,0.0


In [1]:
from utils import *
# Set up interactive widgets for the variables
from ipywidgets import interact, IntSlider, Checkbox, BoundedIntText, BoundedFloatText, Dropdown
import ipywidgets as widgets


def _bounded_int_box(description, value, maximum):
    """Return a 150px-wide BoundedIntText accepting 1..maximum in steps of 1.

    Each call creates its own Layout object, so the boxes do not share layout state.
    """
    return widgets.BoundedIntText(
        description=description,
        value=value,
        min=1,
        max=maximum,
        step=1,
        disabled=False,
        layout=widgets.Layout(width='150px'),
        style={'description_width': 'initial'},
    )


# Largest batch size offered to the demand-curve callback
max_batch_size = _bounded_int_box('Max Batch Size:', 8, 128)

# Use-case selector; the preset values are applied by update_visibility_usecases
usecases = Dropdown(
    description='Usecases:',
    options=['Ques-Ans', 'Text Summarization', 'Chatbots', 'Code Gen.', 'Custom'],
    value='Chatbots',
    disabled=False,
)

# Number of parallel beams
beam_size = widgets.IntSlider(
    description='# of Parallel Beams:',
    value=2,
    min=1,
    max=16,
    style={'description_width': 'initial'},
)

# Prompt length and generated length, in tokens
input_tokens = _bounded_int_box('Input Tokens:', 2048, 100000)
output_tokens = _bounded_int_box('Output Tokens:', 128, 100000)

# Numeric format selector
quantization = widgets.Dropdown(
    description='Quantization:',
    options=['bf16', 'int8', 'int4', 'int2'],
    value='int8',
    disabled=False,
    layout=widgets.Layout(width='150px'),
    style={'description_width': 'initial'},
)
# Each entry is (label shown in the selector, value reported by the widget).
_model_choices = [
    ('meta-llama/Llama-2-7B', 'llama2_7b'),
    ('meta-llama/Meta-Llama-3-8B', 'llama3_8b'),
    ('meta-llama/Llama-2-13B', 'llama_13b'),
    ('meta-llama/Llama-2-70B', 'LLaMA_70b'),
    ('google/gemma-2B', 'gemma_2b'),
    ('google/gemma-7B', 'gemma_7b'),
    ('google/gemma-2-9B', 'gemma2_9b'),
    ('google/gemma-2-27B', 'gemma2_27b'),
    ('mistralai/mistral-7B', 'mistral_7b'),
    ('mistralai/Mixtral-8x7B', 'mixtral_8x7b'),
    ('microsoft/phi3mini', 'phi3mini'),
    ('microsoft/phi3small', 'phi3small'),
    ('microsoft/phi3medium', 'phi3medium'),
    ('databricks/dbrx-base', 'dbrx'),
    ('xai-org/grok-1', 'grok-1'),
    ('openai/gpt-3', 'gpt-3'),
    ('openai/gpt-4', 'gpt-4'),
    ('facebook/opt-125m', 'opt_125m'),
    ('facebook/opt-350m', 'opt_350m'),
    ('facebook/opt-1.3b', 'opt_1b'),
    ('facebook/opt-175b', 'opt_175b'),
]

# Multi-select list of models; 'llama2_7b' is selected by default.
model_box = widgets.SelectMultiple(
    description='Models:',
    options=_model_choices,
    value=['llama2_7b'],
    disabled=False,
    layout=widgets.Layout(width='300px', height='150px'),
)
# Hardware selector; choosing 'Custom' reveals the manual FLOPS/bandwidth boxes
_system_choices = [
    'A100_40GB_GPU', 'A100_80GB_GPU', 'H100_GPU', 'GH200_GPU',
    'TPUv4', 'TPUv5e', 'MI300X', 'Gaudi3', 'Custom',
]
system = Dropdown(
    description='System:',
    options=_system_choices,
    value='H100_GPU',
    disabled=False,
)

# Node count (plain integer box, no bounds enforced by the widget)
nodes = widgets.IntText(
    description='# Nodes:',
    value=2,
    disabled=False,
    layout=widgets.Layout(width='150px'),
)

# Efficiency factor in [0, 1]; continuous_update=False means the value is
# committed on release rather than on every drag step
system_efficiency = widgets.FloatSlider(
    description='System Efficiency:',
    value=0.80,
    min=0,
    max=1.0,
    step=0.01,
    orientation='horizontal',
    readout=True,
    readout_format='.2f',
    continuous_update=False,
    disabled=False,
)


# Manual hardware parameters for the 'Custom' system. All four boxes are
# hidden later in this cell and only shown when 'Custom' is selected.

# Compute throughput
flops = widgets.FloatText(value=1000, description='FLOPS(T):', disabled=False, layout=widgets.Layout(width='200px'),)
# Memory bandwidth
mem_bw = widgets.FloatText(value=3.6, description='MEM BW(TB/s):', disabled=False, layout=widgets.Layout(width='200px'),)
# Memory capacity. The label used to read 'FLOPS(GBs):', a copy-paste slip
# from the flops box that mislabelled this field in the UI.
mem_cap = widgets.FloatText(value=48, description='MEM CAP(GB):', disabled=False, layout=widgets.Layout(width='200px'),)
# Interconnect bandwidth
icn_bw = widgets.FloatText(value=100.0, description='ICN BW(GB/s):', disabled=False, layout=widgets.Layout(width='200px'),)


# Toggle the manual hardware boxes together with the System dropdown
def update_visibility_system_param(change):
    """Show the custom-system inputs when the new value contains 'Custom'; hide them otherwise.

    change: observer payload; only change['new'] is read.
    """
    # '' restores the default CSS display, 'none' removes the box from view.
    display_mode = '' if 'Custom' in change['new'] else 'none'
    for param_box in (flops, mem_bw, mem_cap, icn_bw):
        param_box.layout.display = display_mode

# Run update_visibility_system_param whenever the System dropdown's value changes
system.observe(update_visibility_system_param, names='value')

# Load the beam/token presets that belong to the chosen use case
def update_visibility_usecases(change):
    """Apply preset beam, input-token and output-token values for a use case.

    The presets are checked in order and the first one whose name occurs in
    change['new'] wins. When none matches (e.g. 'Custom'), the three widgets
    are left untouched.
    """
    # (use-case name, (beam_size, input_tokens, output_tokens))
    presets = (
        ('Ques-Ans', (4, 1000, 200)),
        ('Text Summarization', (4, 15000, 1000)),
        ('Chatbots', (2, 2048, 128)),
        ('Code Gen.', (4, 20000, 50)),
    )
    selected = change['new']
    for name, (beams, prompt_len, gen_len) in presets:
        if name in selected:
            beam_size.value = beams
            input_tokens.value = prompt_len
            output_tokens.value = gen_len
            break

# Run update_visibility_usecases whenever the Usecases dropdown's value changes
usecases.observe(update_visibility_usecases, names='value')

# Disabled alternative kept from the original: hide the token/beam inputs at start-up.
# # Initially hide custom params
# beam_size.layout.display = 'none'
# input_tokens.layout.display = 'none'
# output_tokens.layout.display = 'none'

# Start with the manual hardware boxes hidden; the System dropdown observer
# reveals them when 'Custom' is selected.
for _hidden_box in (flops, mem_bw, mem_cap, icn_bw):
    _hidden_box.layout.display = 'none'

# Layout: assemble the control panel from nested boxes

# Row 1 (right of the model list): quantization + max batch size
left_box = widgets.HBox(children=[quantization, max_batch_size])

# Use-case selector, beam slider, then the two token boxes side by side
_token_row = widgets.HBox(children=[input_tokens, output_tokens])
input_param_box = widgets.VBox(children=[usecases, beam_size, _token_row])
top_box = widgets.VBox(children=[left_box, input_param_box])

# System row and the (initially hidden) manual hardware row beneath it
bottom_box = widgets.HBox(children=[system, nodes, system_efficiency])
system_bottom_box = widgets.HBox(children=[flops, mem_bw, mem_cap, icn_bw])


# Final layout: model list next to the parameter column, system rows below
_models_and_params = widgets.HBox(children=[model_box, top_box])
final_layout = widgets.VBox(
    children=[_models_and_params, bottom_box, system_bottom_box],
    layout=widgets.Layout(justify_content='space-between'),
)

# Keyword argument of generate_demand_curve -> widget that supplies its value.
# generate_demand_curve is not defined in this cell (presumably it comes from
# `from utils import *` -- confirm).
_demand_curve_controls = {
    'system_box': system,
    'system_eff': system_efficiency,
    'num_nodes_slider': nodes,
    'model_box': model_box,
    'quantization_box': quantization,
    'batch_slider': max_batch_size,
    'input_token_slider': input_tokens,
    'output_token_slider': output_tokens,
    'beam_size': beam_size,
    'flops': flops,
    'mem_bw': mem_bw,
    'mem_cap': mem_cap,
    'icn_bw': icn_bw,
}

# Bind the widgets to the callback, then show the hand-built control panel.
widgets.interact(generate_demand_curve, **_demand_curve_controls)

display(final_layout)



interactive(children=(Dropdown(description='System:', index=2, options=('A100_40GB_GPU', 'A100_80GB_GPU', 'H10…

VBox(children=(HBox(children=(SelectMultiple(description='Models:', index=(0,), layout=Layout(height='150px', …