# Day 34: Cost Dashboard - Part 4

Building a comprehensive cost/performance dashboard for LLM serving systems.

## Overview
1. Cost metrics collection
2. Performance tracking
3. Dashboard visualization
4. Optimization recommendations

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict, deque

## 1. Cost Metrics Collector

In [None]:
class CostMetricsCollector:
    """In-memory store for per-request and infrastructure metrics.

    Records are plain dicts appended to ``self.metrics`` under the keys
    ``'requests'`` and ``'infrastructure'``. Each record is stamped with
    ``time.time()`` at the moment it is recorded.
    """

    def __init__(self):
        # defaultdict(list) lets readers index a category before anything
        # has been recorded and get an empty list back.
        self.metrics = defaultdict(list)
        self.start_time = time.time()

    def record_request(self, tenant_id, tokens_input, tokens_output,
                       processing_time, cost, cached=False):
        """Record metrics for a request.

        Args:
            tenant_id: Identifier of the tenant that issued the request.
            tokens_input: Number of input tokens.
            tokens_output: Number of output tokens.
            processing_time: Processing duration; used as the divisor for
                ``tokens_per_second``.
            cost: Cost attributed to the request.
            cached: Whether the request was served from cache.
        """
        total_tokens = tokens_input + tokens_output

        # A non-positive duration would make the rate undefined; report 0.
        if processing_time > 0:
            tokens_per_second = total_tokens / processing_time
        else:
            tokens_per_second = 0

        self.metrics['requests'].append({
            'timestamp': time.time(),
            'tenant_id': tenant_id,
            'tokens_input': tokens_input,
            'tokens_output': tokens_output,
            'total_tokens': total_tokens,
            'processing_time': processing_time,
            'cost': cost,
            'cached': cached,
            'tokens_per_second': tokens_per_second,
        })

    def record_infrastructure(self, gpu_utilization, memory_usage,
                              active_requests, queue_length):
        """Record infrastructure metrics.

        The four values are stored as given, together with the current
        timestamp, under ``metrics['infrastructure']``.
        """
        self.metrics['infrastructure'].append({
            'timestamp': time.time(),
            'gpu_utilization': gpu_utilization,
            'memory_usage': memory_usage,
            'active_requests': active_requests,
            'queue_length': queue_length,
        })

    def get_cost_summary(self, time_window_hours=1):
        """Get cost summary for the specified time window.

        Args:
            time_window_hours: Size of the look-back window in hours. Only
                requests whose timestamp is at or after ``now - window`` are
                included.

        Returns:
            An empty dict when no request falls inside the window; otherwise
            a dict with totals (cost, tokens, requests, cached requests),
            the cache hit rate, per-request and per-token average cost, mean
            processing time, mean per-request tokens/second, and
            ``requests_per_hour``. ``requests_per_hour`` is 0 when the
            window is not positive, so a zero window no longer raises
            ``ZeroDivisionError``.
        """
        cutoff = time.time() - (time_window_hours * 3600)
        recent_requests = [r for r in self.metrics['requests']
                           if r['timestamp'] >= cutoff]

        if not recent_requests:
            return {}

        total_cost = sum(r['cost'] for r in recent_requests)
        total_tokens = sum(r['total_tokens'] for r in recent_requests)
        total_requests = len(recent_requests)
        cached_requests = sum(1 for r in recent_requests if r['cached'])

        avg_processing_time = np.mean([r['processing_time'] for r in recent_requests])
        # Mean of the per-request rates, not total tokens over total time.
        avg_tokens_per_second = np.mean([r['tokens_per_second'] for r in recent_requests])

        return {
            'total_cost': total_cost,
            'total_tokens': total_tokens,
            'total_requests': total_requests,
            'cached_requests': cached_requests,
            'cache_hit_rate': cached_requests / total_requests if total_requests > 0 else 0,
            'avg_cost_per_request': total_cost / total_requests if total_requests > 0 else 0,
            'avg_cost_per_token': total_cost / total_tokens if total_tokens > 0 else 0,
            'avg_processing_time': avg_processing_time,
            'avg_tokens_per_second': avg_tokens_per_second,
            # Guarded like the ratios above: a zero (or negative) window has
            # no meaningful hourly rate.
            'requests_per_hour': total_requests / time_window_hours if time_window_hours > 0 else 0,
        }

# Initialize collector
collector = CostMetricsCollector()

# Simulate some data
# Three tenants, each billed at its own flat per-token rate.
tenants = ['basic_tenant', 'premium_tenant', 'enterprise_tenant']
costs_per_token = {'basic_tenant': 0.002, 'premium_tenant': 0.001, 'enterprise_tenant': 0.0008}

print("Simulating request data...")
# NOTE: no random seed is set, so every run produces different data.
for i in range(100):
    tenant = np.random.choice(tenants)
    tokens_input = np.random.randint(50, 300)
    tokens_output = np.random.randint(20, 200)
    # Processing time grows linearly with token count, plus Gaussian noise.
    processing_time = (tokens_input + tokens_output) * 0.01 + np.random.normal(0, 0.1)
    # Clamp so the noise can never produce a zero or negative duration.
    processing_time = max(0.01, processing_time)
    
    cached = np.random.random() < 0.2  # 20% cache hit rate
    # Cached requests are recorded as free; others pay the tenant's rate.
    cost = 0 if cached else (tokens_input + tokens_output) * costs_per_token[tenant]
    
    collector.record_request(tenant, tokens_input, tokens_output, 
                           processing_time, cost, cached)
    
    # Simulate infrastructure metrics
    # One infrastructure sample for every 10 requests (10 samples in total).
    if i % 10 == 0:
        gpu_util = np.random.uniform(60, 95)
        memory_usage = np.random.uniform(70, 90)
        active_reqs = np.random.randint(5, 20)
        queue_len = np.random.randint(0, 10)
        
        collector.record_infrastructure(gpu_util, memory_usage, active_reqs, queue_len)

print("Data simulation complete.")

## 2. Dashboard Visualization

In [None]:
def create_cost_dashboard(collector):
    """Create comprehensive cost dashboard.

    Draws a 3x3 grid of panels from the collector's recorded metrics and
    shows it with ``plt.show()``.

    Args:
        collector: Object exposing ``get_cost_summary()`` and a ``metrics``
            mapping with ``'requests'`` and ``'infrastructure'`` record
            lists (as produced by ``CostMetricsCollector``).

    Returns:
        The dict returned by ``collector.get_cost_summary()``. The charts
        use every recorded request, while the summary panel only covers the
        collector's default summary window. When no requests have been
        recorded, nothing is drawn and the summary is returned as-is.
    """
    # Get summary metrics
    summary = collector.get_cost_summary()

    # Prepare data
    requests_df = pd.DataFrame(collector.metrics['requests'])
    infra_df = pd.DataFrame(collector.metrics['infrastructure'])

    # A frame built from an empty list has no columns, so every panel below
    # would fail with a KeyError; bail out before creating a figure.
    if requests_df.empty:
        print("No request data recorded; nothing to plot.")
        return summary

    # Create dashboard
    fig = plt.figure(figsize=(16, 12))

    # 1. Cost over time (x axis is the request sequence number)
    ax1 = fig.add_subplot(3, 3, 1)
    requests_df['cumulative_cost'] = requests_df['cost'].cumsum()
    ax1.plot(requests_df.index, requests_df['cumulative_cost'])
    ax1.set_title('Cumulative Cost Over Time')
    ax1.set_ylabel('Cost ($)')
    ax1.grid(True, alpha=0.3)

    # 2. Cost by tenant
    ax2 = fig.add_subplot(3, 3, 2)
    cost_by_tenant = requests_df.groupby('tenant_id')['cost'].sum()
    ax2.bar(cost_by_tenant.index, cost_by_tenant.values)
    ax2.set_title('Cost by Tenant')
    ax2.set_ylabel('Total Cost ($)')
    plt.setp(ax2.get_xticklabels(), rotation=45)

    # 3. Cache hit rate impact
    ax3 = fig.add_subplot(3, 3, 3)
    # Force a boolean mask so `~` negates logically even if the column was
    # inferred as object dtype.
    is_cached = requests_df['cached'].astype(bool)
    cached_costs = requests_df.loc[is_cached, 'cost'].sum()
    uncached_costs = requests_df.loc[~is_cached, 'cost'].sum()
    if cached_costs + uncached_costs > 0:
        ax3.pie([cached_costs, uncached_costs], labels=['Cached', 'Uncached'], autopct='%1.1f%%')
    else:
        # A pie cannot be normalised when every slice is zero.
        ax3.text(0.5, 0.5, 'No cost recorded', transform=ax3.transAxes,
                 horizontalalignment='center', verticalalignment='center')
    ax3.set_title('Cost Distribution: Cached vs Uncached')

    # 4. Tokens per second distribution
    ax4 = fig.add_subplot(3, 3, 4)
    ax4.hist(requests_df['tokens_per_second'], bins=20, alpha=0.7)
    ax4.set_title('Tokens per Second Distribution')
    ax4.set_xlabel('Tokens/Second')
    ax4.set_ylabel('Frequency')

    # 5. Processing time vs tokens
    ax5 = fig.add_subplot(3, 3, 5)
    ax5.scatter(requests_df['total_tokens'], requests_df['processing_time'], alpha=0.6)
    ax5.set_title('Processing Time vs Total Tokens')
    ax5.set_xlabel('Total Tokens')
    ax5.set_ylabel('Processing Time (s)')

    # 6. GPU utilization over time
    ax6 = fig.add_subplot(3, 3, 6)
    ax6.set_title('GPU Utilization Over Time')
    ax6.set_ylabel('GPU Utilization (%)')
    if not infra_df.empty:
        ax6.plot(infra_df.index, infra_df['gpu_utilization'])
        ax6.grid(True, alpha=0.3)

    # 7. Cost efficiency by tenant
    ax7 = fig.add_subplot(3, 3, 7)
    # Sum only the two needed columns per tenant instead of applying a
    # lambda to whole groups (deprecated when it touches grouping columns).
    tenant_totals = requests_df.groupby('tenant_id')[['total_tokens', 'cost']].sum()
    # Tenants with no cost (e.g. all requests cached) are shown as 0.
    efficiency = (tenant_totals['total_tokens'] / tenant_totals['cost']).where(
        tenant_totals['cost'] > 0, 0
    )
    ax7.bar(efficiency.index, efficiency.values)
    ax7.set_title('Cost Efficiency (Tokens per Dollar)')
    ax7.set_ylabel('Tokens/$')
    plt.setp(ax7.get_xticklabels(), rotation=45)

    # 8. Request volume over time (hourly)
    ax8 = fig.add_subplot(3, 3, 8)
    # Buckets are runs of 10 consecutive requests, not wall-clock hours.
    requests_df['hour'] = (requests_df.index // 10).astype(int)  # Group by 10s for demo
    hourly_requests = requests_df.groupby('hour').size()
    ax8.bar(hourly_requests.index, hourly_requests.values)
    ax8.set_title('Request Volume Over Time')
    ax8.set_ylabel('Requests')

    # 9. Key metrics summary
    ax9 = fig.add_subplot(3, 3, 9)
    ax9.axis('off')

    # .get(..., 0) keeps the panel renderable when the summary is empty.
    metrics_text = f"""
    KEY METRICS
    
    Total Cost: ${summary.get('total_cost', 0):.3f}
    Total Requests: {summary.get('total_requests', 0)}
    Total Tokens: {summary.get('total_tokens', 0):,}
    
    Cache Hit Rate: {summary.get('cache_hit_rate', 0):.1%}
    Avg Cost/Request: ${summary.get('avg_cost_per_request', 0):.4f}
    Avg Cost/Token: ${summary.get('avg_cost_per_token', 0):.5f}
    
    Avg Processing Time: {summary.get('avg_processing_time', 0):.3f}s
    Avg Tokens/Second: {summary.get('avg_tokens_per_second', 0):.1f}
    Requests/Hour: {summary.get('requests_per_hour', 0):.1f}
    """

    ax9.text(0.1, 0.9, metrics_text, transform=ax9.transAxes,
             fontsize=10, verticalalignment='top', fontfamily='monospace')

    plt.tight_layout()
    plt.show()

    return summary

# Create dashboard
# Renders the figure and keeps the returned summary dict for later cells.
summary = create_cost_dashboard(collector)
print("\nDashboard created successfully!")

## 3. Optimization Recommendations Engine

In [None]:
class OptimizationEngine:
    """Turn collected metrics into cost-optimization recommendations.

    The collector must expose ``get_cost_summary()`` and a ``metrics``
    mapping with ``'requests'`` and ``'infrastructure'`` record lists.
    """

    # Fraction of total cost assumed recoverable per recommendation type.
    # Types not listed here (e.g. 'Tenant Optimization') contribute nothing.
    _SAVINGS_RATES = {
        'Cache Optimization': 0.2,       # 20% savings from better caching
        'Processing Efficiency': 0.15,   # 15% savings from efficiency
        'Resource Utilization': 0.1,     # 10% savings from better utilization
        'Cost Optimization': 0.25,       # 25% savings from cost optimization
    }

    def __init__(self, collector):
        self.collector = collector

    def analyze_and_recommend(self):
        """Analyze metrics and provide optimization recommendations.

        Returns:
            A list of dicts with the keys ``'type'``, ``'priority'``,
            ``'description'`` and ``'action'``, in the order the checks run.
            Checks that read the cost summary are skipped when the summary
            is empty, so missing data is not reported as a low cache hit
            rate or low throughput.
        """
        recommendations = []

        # Get recent data
        summary = self.collector.get_cost_summary()
        requests_df = pd.DataFrame(self.collector.metrics['requests'])
        infra_df = pd.DataFrame(self.collector.metrics['infrastructure'])

        # An empty summary means there is nothing to judge; defaulting the
        # missing values to 0 would trip the "below threshold" checks.
        has_summary = bool(summary)

        # 1. Cache optimization
        if has_summary:
            cache_hit_rate = summary.get('cache_hit_rate', 0)
            if cache_hit_rate < 0.3:
                potential_savings = summary.get('total_cost', 0) * (0.3 - cache_hit_rate)
                recommendations.append({
                    'type': 'Cache Optimization',
                    'priority': 'High',
                    'description': f'Cache hit rate is {cache_hit_rate:.1%}. Improving to 30% could save ${potential_savings:.3f}',
                    'action': 'Implement semantic caching or increase cache size'
                })

        # 2. Processing efficiency
        if has_summary:
            avg_tokens_per_sec = summary.get('avg_tokens_per_second', 0)
            if avg_tokens_per_sec < 50:
                recommendations.append({
                    'type': 'Processing Efficiency',
                    'priority': 'Medium',
                    'description': f'Average throughput is {avg_tokens_per_sec:.1f} tokens/sec, which is below optimal',
                    'action': 'Consider model quantization or better batching strategies'
                })

        # 3. GPU utilization (uses every recorded sample, not just the window)
        if not infra_df.empty:
            avg_gpu_util = infra_df['gpu_utilization'].mean()
            if avg_gpu_util < 70:
                recommendations.append({
                    'type': 'Resource Utilization',
                    'priority': 'Medium',
                    'description': f'GPU utilization is {avg_gpu_util:.1f}%, indicating underutilization',
                    'action': 'Increase batch sizes or implement auto-scaling'
                })

        # 4. Cost per token analysis
        if has_summary:
            cost_per_token = summary.get('avg_cost_per_token', 0)
            if cost_per_token > 0.002:
                recommendations.append({
                    'type': 'Cost Optimization',
                    'priority': 'High',
                    'description': f'Cost per token is ${cost_per_token:.5f}, which is above industry average',
                    'action': 'Review pricing tiers or negotiate better rates with providers'
                })

        # 5. Tenant efficiency analysis
        if not requests_df.empty:
            # Sum only the two needed columns per tenant instead of applying
            # a lambda to whole groups (deprecated when it touches grouping
            # columns).
            tenant_totals = requests_df.groupby('tenant_id')[['total_tokens', 'cost']].sum()
            # Tenants with no cost count as infinitely efficient, so they
            # are never flagged.
            tenant_efficiency = (tenant_totals['total_tokens'] / tenant_totals['cost']).where(
                tenant_totals['cost'] > 0, float('inf')
            )

            inefficient_tenants = tenant_efficiency[tenant_efficiency < 1000].index.tolist()
            if inefficient_tenants:
                recommendations.append({
                    'type': 'Tenant Optimization',
                    'priority': 'Low',
                    'description': f'Tenants {inefficient_tenants} have low cost efficiency',
                    'action': 'Review usage patterns and consider tier adjustments'
                })

        return recommendations

    def estimate_savings(self, recommendations):
        """Estimate potential cost savings from recommendations.

        Args:
            recommendations: Iterable of recommendation dicts; only the
                ``'type'`` key is read.

        Returns:
            The sum of ``total_cost * rate`` over the recommendations with a
            known rate, capped at 50% of the summary's total cost. The
            amount covers the same window as ``get_cost_summary()``.
        """
        summary = self.collector.get_cost_summary()
        total_cost = summary.get('total_cost', 0)

        estimated_savings = 0

        for rec in recommendations:
            rate = self._SAVINGS_RATES.get(rec['type'])
            if rate is not None:
                estimated_savings += total_cost * rate

        return min(estimated_savings, total_cost * 0.5)  # Cap at 50% savings

# Generate recommendations
optimizer = OptimizationEngine(collector)
recommendations = optimizer.analyze_and_recommend()
estimated_savings = optimizer.estimate_savings(recommendations)

# estimate_savings() is computed from get_cost_summary()'s default 1-hour
# window, so the figure is per hour. Projecting it to a month means scaling
# by the hours in a 30-day month, not by 30.
HOURS_PER_MONTH = 24 * 30

print("\n=== OPTIMIZATION RECOMMENDATIONS ===")
print(f"Potential Monthly Savings: ${estimated_savings * HOURS_PER_MONTH:.2f}")
print("\nRecommendations:")

for i, rec in enumerate(recommendations, 1):
    print(f"\n{i}. {rec['type']} (Priority: {rec['priority']})")
    print(f"   Issue: {rec['description']}")
    print(f"   Action: {rec['action']}")

if not recommendations:
    print("\nNo optimization opportunities identified. System is running efficiently!")

## Conclusion

A comprehensive cost dashboard provides:

1. **Real-time Monitoring**: Track costs, performance, and resource utilization
2. **Trend Analysis**: Identify patterns and anomalies in usage
3. **Optimization Insights**: Automated recommendations for cost reduction
4. **ROI Tracking**: Measure impact of optimization efforts

This dashboard enables data-driven decisions for cost optimization in LLM serving systems.