# 🚀 NIM Performance Analysis

This notebook shows:
- **Concurrency vs RPS, TTFT, and ITL**
- **GPU requirements for target performance**
- **An interactive GPU-requirements calculator (sliders and dropdowns)**


In [None]:
import glob
import re
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ipywidgets import interact, IntSlider, Dropdown
import warnings
# NOTE(review): this filter silences every warning raised for the rest of the
# session, not just noisy ones — narrow or remove it when diagnosing problems.
warnings.filterwarnings('ignore')

# Simple confirmation that the import cell ran to completion.
print("📚 Libraries loaded successfully!")


In [None]:
# Data loading for the nested GenAI-Perf artifact layout.
#
# Expected path shape (always matched with forward slashes):
# genai-perf-artifacts/model-<model>/profile-<profile>/<timestamp>/concurrency-<N>/<run-name>/<in>_<out>_genai_perf.json
_ARTIFACT_PATH_PATTERN = re.compile(
    r'model-([^/]+)/profile-([^/]+)/([^/]+)/concurrency-(\d+)/[^/]+/(\d+)_(\d+)_genai_perf\.json'
)


def _gpus_from_profile(profile, default_gpus):
    """Return tp * pp parsed from a profile name like '...-tp4-pp1', else default_gpus."""
    tp_match = re.search(r'(?:^|[-_])tp(\d+)(?=$|[-_])', profile)
    if tp_match is None:
        return default_gpus
    pp_match = re.search(r'(?:^|[-_])pp(\d+)(?=$|[-_])', profile)
    gpus = int(tp_match.group(1)) * (int(pp_match.group(1)) if pp_match else 1)
    return gpus if gpus > 0 else default_gpus


def _metric_avg(report, key):
    """Return report[key]['avg'], or 0 when the metric or its 'avg' field is absent."""
    return report.get(key, {}).get('avg', 0)


def load_data(default_gpus=4):
    """Load every GenAI-Perf JSON report under ./*genai-perf*/ into a DataFrame.

    Metadata (model, profile, run timestamp, concurrency, input/output token
    counts) is parsed from each file's path; the 'avg' value of each metric is
    read from the JSON body. Files whose path does not match the expected
    layout, or that fail to load, are reported and skipped.

    Args:
        default_gpus: GPU count assumed for a run whose profile name carries no
            'tp<N>' marker. Defaults to 4, the value previously hard-coded.

    Returns:
        pandas.DataFrame with one row per report and the columns model,
        profile, datetime, concurrency, input_tokens, output_tokens,
        token_config, rps, ttft_ms, itl_ms, tokens_per_sec, total_latency_ms,
        gpus_used and rps_per_gpu. An empty DataFrame is returned when no file
        is found or none could be loaded. Missing metrics are stored as 0.
    """
    # Local import: the notebook's import cell does not provide pathlib.
    from pathlib import PurePath

    files = glob.glob("*genai-perf*/**/*genai_perf.json", recursive=True)

    if not files:
        print("❌ No benchmark files found!")
        return pd.DataFrame()

    print(f"📁 Found {len(files)} benchmark files")

    results = []

    for file in files:
        try:
            print(f"Processing: {file}")

            # glob returns OS-native separators; the pattern only knows '/'.
            path_match = _ARTIFACT_PATH_PATTERN.search(PurePath(file).as_posix())
            if not path_match:
                print(f"   ⚠️ Path doesn't match pattern: {file}")
                continue

            model, profile, run_timestamp = path_match.group(1, 2, 3)
            concurrency, input_tokens, output_tokens = (
                int(value) for value in path_match.group(4, 5, 6)
            )

            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            result = {
                'model': model,
                'profile': profile,
                'datetime': run_timestamp,
                'concurrency': concurrency,
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                'token_config': f"{input_tokens}→{output_tokens}",
                'rps': _metric_avg(data, 'request_throughput'),
                'ttft_ms': _metric_avg(data, 'time_to_first_token'),
                'itl_ms': _metric_avg(data, 'inter_token_latency'),
                'tokens_per_sec': _metric_avg(data, 'output_token_throughput'),
                'total_latency_ms': _metric_avg(data, 'request_latency')
            }

            print(f"   ✅ Loaded: Concurrency {concurrency}, {input_tokens}→{output_tokens}, RPS {result['rps']:.1f}")
            results.append(result)

        except Exception as e:
            # Deliberate best-effort: one unreadable report must not abort the load.
            print(f"   ❌ Error processing {file}: {e}")
            continue

    df = pd.DataFrame(results)

    if df.empty:
        print("❌ No valid data loaded")
        return df

    # Derived metrics: GPUs per run come from the profile's tp/pp markers
    # (tensor-parallel x pipeline-parallel), falling back to default_gpus.
    df['gpus_used'] = [_gpus_from_profile(name, default_gpus) for name in df['profile']]
    df['rps_per_gpu'] = df['rps'] / df['gpus_used']

    print(f"\n✅ Successfully loaded {len(df)} benchmark results")
    print(f"📊 Models: {list(df['model'].unique())}")
    print(f"📊 Concurrency levels: {sorted(df['concurrency'].unique())}")
    print(f"🎯 Token configs: {list(df['token_config'].unique())}")

    return df

# Load the data once into the notebook-level `df`; the summary, chart,
# calculator and reference-table cells below all read this variable.
df = load_data()


In [None]:
# Display data summary
if not df.empty:
    print("📈 PERFORMANCE SUMMARY BY CONCURRENCY")
    print("=" * 60)

    # load_data() stores 0 for a latency metric that is absent from a report.
    # Mask those placeholders so they neither pull the means down nor get
    # reported as the "best" (lowest) latency.
    measured = df.assign(
        ttft_ms=df['ttft_ms'].where(df['ttft_ms'] > 0),
        itl_ms=df['itl_ms'].where(df['itl_ms'] > 0),
    )

    summary = measured.groupby('concurrency').agg({
        'rps': 'mean',
        'ttft_ms': 'mean',
        'itl_ms': 'mean',
        'tokens_per_sec': 'mean'
    }).round(1)

    for concurrency, row in summary.iterrows():
        print(f"Concurrency {concurrency:2d}: {row['rps']:5.1f} RPS | {row['ttft_ms']:6.1f}ms TTFT | {row['itl_ms']:5.1f}ms ITL | {row['tokens_per_sec']:6.1f} tok/s")

    print("\n💡 Key Insights:")
    best_rps = df.loc[df['rps'].idxmax()]
    print(f"   • Best RPS: {best_rps['rps']:.1f} at concurrency {best_rps['concurrency']} ({best_rps['token_config']})")

    # Lowest latency wins, considering only runs that actually reported the metric.
    for label, column in (('TTFT', 'ttft_ms'), ('ITL', 'itl_ms')):
        reported = measured[column].dropna()
        if reported.empty:
            print(f"   • Best {label}: n/a (metric missing from every report)")
            continue
        best = df.loc[reported.idxmin()]
        print(f"   • Best {label}: {best[column]:.1f}ms at concurrency {best['concurrency']} ({best['token_config']})")
else:
    print("❌ No data to analyze")


In [None]:
# Build the 2x2 performance dashboard: three "metric vs concurrency" line
# panels plus one RPS-vs-TTFT scatter whose marker size encodes ITL.
if df.empty:
    print("❌ No data to plot")
else:
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Concurrency vs RPS', 'Concurrency vs TTFT',
                       'Concurrency vs ITL', 'RPS vs Latency Trade-offs')
    )

    palette = ['red', 'blue', 'green', 'orange']

    # One entry per line panel:
    # (y column, legend suffix, extra line style, extra trace options, row, col)
    line_panels = [
        ('rps', 'RPS', dict(width=3), dict(marker=dict(size=8)), 1, 1),
        ('ttft_ms', 'TTFT', dict(dash='dash'), dict(showlegend=False), 1, 2),
        ('itl_ms', 'ITL', dict(dash='dot'), dict(showlegend=False), 2, 1),
    ]

    for idx, config in enumerate(df['token_config'].unique()):
        subset = df[df['token_config'] == config].sort_values('concurrency')
        shade = palette[idx % len(palette)]  # colours cycle when configs outnumber the palette

        # Panels 1-3: metric against concurrency
        for column, suffix, line_extra, trace_extra, panel_row, panel_col in line_panels:
            fig.add_trace(
                go.Scatter(x=subset['concurrency'], y=subset[column],
                           mode='lines+markers', name=f'{config} {suffix}',
                           line=dict(color=shade, **line_extra), **trace_extra),
                row=panel_row, col=panel_col
            )

        # Panel 4: RPS vs TTFT with ITL as marker size
        fig.add_trace(
            go.Scatter(x=subset['ttft_ms'], y=subset['rps'],
                       mode='markers', name=f'{config} Trade-off',
                       marker=dict(size=subset['itl_ms']/10, color=shade),
                       text=subset['concurrency'], showlegend=False),
            row=2, col=2
        )

    fig.update_layout(height=800, title_text="📊 Performance Analysis Dashboard")

    # (row, col) -> (x-axis title, y-axis title)
    axis_titles = {
        (1, 1): ("Concurrency", "RPS"),
        (1, 2): ("Concurrency", "TTFT (ms)"),
        (2, 1): ("Concurrency", "ITL (ms)"),
        (2, 2): ("TTFT (ms)", "RPS"),
    }
    for (panel_row, panel_col), (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)
        fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col)

    fig.show()


In [None]:
# Working interactive calculator with model support

# Sizing, cost and UX thresholds used by gpu_calculator(). They are constants
# rather than extra function parameters because interact() derives a widget
# from each parameter of the function it wraps.
GPUS_PER_SERVER = 8
GPU_MONTHLY_COST = 375        # simplified cost per GPU
SERVER_MONTHLY_COST = 1250    # simplified cost per server
TTFT_GOOD_MS = 200
ITL_SMOOTH_MS = 50
ITL_ACCEPTABLE_MS = 100


def gpu_calculator(target_rps=10, target_concurrency=25, model='All', token_config='All'):
    """Print a GPU/server estimate for reaching target_rps.

    Picks the benchmark row in the notebook-level `df` whose concurrency is
    closest to target_concurrency (after optional model / token-config
    filtering) and scales its GPU count linearly by target_rps / measured RPS.
    TTFT and ITL are assumed unchanged by scaling.

    Args:
        target_rps: Desired requests per second.
        target_concurrency: Concurrency level to look up in the benchmarks.
        model: Model name to filter on, or 'All' for no filter.
        token_config: Token configuration to filter on, or 'All' for no filter.

    Returns:
        None. All results are printed.
    """

    if df.empty:
        print("❌ No data available")
        return

    # Filter data by model and token config
    data = df

    if model != 'All':
        data = data[data['model'] == model]

    if token_config != 'All':
        data = data[data['token_config'] == token_config]

    if data.empty:
        print(f"❌ No data for model: {model}, token config: {token_config}")
        return

    # Closest match by concurrency; ties resolve to the first row, as idxmin does.
    closest = data.loc[(data['concurrency'] - target_concurrency).abs().idxmin()]

    print(f"🎯 ANALYSIS FOR {target_rps} RPS, CONCURRENCY {target_concurrency}")
    print(f"Model: {model} | Token config: {token_config}")
    print("=" * 70)

    print("📊 CLOSEST BENCHMARK:")
    print(f"   • Model: {closest['model']}")
    print(f"   • Profile: {closest['profile']}")
    print(f"   • Concurrency: {closest['concurrency']}")
    print(f"   • Actual RPS: {closest['rps']:.1f}")
    print(f"   • TTFT: {closest['ttft_ms']:.1f} ms")
    print(f"   • ITL: {closest['itl_ms']:.1f} ms")
    print(f"   • GPUs: {closest['gpus_used']}")

    # Linear scaling needs a positive measured RPS; say so instead of going silent.
    if not closest['rps'] > 0:
        print("\n⚠️ Cannot estimate scaling: the closest benchmark reports no throughput")
        return

    # Calculate scaling
    scale_factor = target_rps / closest['rps']
    needed_gpus = max(1, int(np.ceil(closest['gpus_used'] * scale_factor)))
    needed_servers = int(np.ceil(needed_gpus / GPUS_PER_SERVER))

    # Estimates: throughput grows with GPU count, per-request latency stays constant
    est_rps = closest['rps'] * (needed_gpus / closest['gpus_used'])
    est_ttft = closest['ttft_ms']
    est_itl = closest['itl_ms']

    print(f"\n🚀 SCALING TO {target_rps} RPS:")
    print(f"   • Required GPUs: {needed_gpus}")
    print(f"   • Required Servers: {needed_servers}")
    print(f"   • Estimated RPS: {est_rps:.1f}")
    print(f"   • Expected TTFT: {est_ttft:.1f} ms")
    print(f"   • Expected ITL: {est_itl:.1f} ms")

    # Cost
    monthly_cost = needed_gpus * GPU_MONTHLY_COST + needed_servers * SERVER_MONTHLY_COST
    print(f"   • Est. Monthly Cost: ${monthly_cost:,.0f}")

    # User experience assessment
    print("\n👤 USER EXPERIENCE:")
    if est_ttft < TTFT_GOOD_MS:
        print(f"   ✅ TTFT: Good ({est_ttft:.1f}ms)")
    else:
        print(f"   ⚠️ TTFT: Slow ({est_ttft:.1f}ms)")

    if est_itl < ITL_SMOOTH_MS:
        print(f"   ✅ ITL: Smooth streaming ({est_itl:.1f}ms)")
    elif est_itl < ITL_ACCEPTABLE_MS:
        print(f"   ⚠️ ITL: Acceptable streaming ({est_itl:.1f}ms)")
    else:
        print(f"   ❌ ITL: Choppy streaming ({est_itl:.1f}ms)")

# Create the interactive widget with model selection
if not df.empty:
    model_options = ['All'] + list(df['model'].unique())
    token_options = ['All'] + list(df['token_config'].unique())

    # Let the slider reach every benchmarked concurrency level. 50 remains the
    # floor so the range is never narrower than before.
    max_concurrency = max(50, int(df['concurrency'].max()))

    print("🎮 INTERACTIVE GPU REQUIREMENTS CALCULATOR")
    print("=" * 60)
    print("💡 Includes ITL analysis for streaming quality!")
    print()

    # Each keyword maps a gpu_calculator parameter to the widget that drives it.
    interact(gpu_calculator,
             target_rps=IntSlider(value=10, min=1, max=200, step=1, description='Target RPS:'),
             target_concurrency=IntSlider(value=25, min=1, max=max_concurrency, step=1, description='Concurrency:'),
             model=Dropdown(options=model_options, value='All', description='Model:'),
             token_config=Dropdown(options=token_options, value='All', description='Token Config:'))
else:
    print("❌ Cannot create calculator - no data loaded")


In [None]:
# Quick reference table
if not df.empty:
    print("📋 QUICK REFERENCE TABLE")
    print("=" * 50)

    # Mean RPS / TTFT / ITL per concurrency level, one column group per token config
    summary_table = df.pivot_table(
        index='concurrency',
        columns='token_config',
        values=['rps', 'ttft_ms', 'itl_ms'],
        aggfunc='mean'
    ).round(1)

    display(summary_table)

    print("\n🎯 GPU SCALING REFERENCE (Linear Model):")
    print("Based on best performing configuration:")

    best_rps_per_gpu = df['rps_per_gpu'].max()

    # A zero or NaN best value would make target / best_rps_per_gpu infinite
    # or NaN, and int(np.ceil(...)) would then raise. Report it instead.
    if best_rps_per_gpu > 0:
        scaling_examples = [1, 5, 10, 25, 50, 100]
        for target in scaling_examples:
            gpus = max(1, int(np.ceil(target / best_rps_per_gpu)))
            servers = int(np.ceil(gpus / 8))  # 8 GPUs per server
            print(f"   • {target:3d} RPS → {gpus:2d} GPUs ({servers} servers)")
    else:
        print("   ⚠️ No benchmark reported a positive RPS; cannot derive GPU scaling")
else:
    print("❌ No data for reference table")


## 🎯 Understanding the Metrics

### 📊 **Key Differences:**
- **Concurrency**: How many requests are in flight at the same time
- **RPS**: How many requests completed per second  
- **TTFT**: Time to first token (user wait time)
- **ITL**: Inter-token latency (streaming quality)

### 💡 **Why ITL Matters:**
- **Low ITL (<50ms)**: Smooth, natural streaming like ChatGPT
- **Medium ITL (50-100ms)**: Acceptable but noticeable pauses
- **High ITL (>100ms)**: Choppy, slow feeling responses

### 🎯 **Optimization Strategy:**
1. **For throughput**: Increase concurrency until RPS peaks
2. **For user experience**: Keep TTFT <200ms, ITL <50ms
3. **For scaling**: Add GPUs in proportion to the target RPS (the calculator assumes throughput scales linearly with GPU count)
4. **Sweet spot**: Balance concurrency for max RPS without bad ITL

---

*Use the interactive calculator above to explore different scenarios!*
