# Day 35: Metrics Collection for LLM Services - Part 2

This notebook implements comprehensive metrics collection for an LLM service so that its performance, usage, and system health can be monitored.

## Overview
1. Setting up Prometheus metrics
2. Instrumenting an LLM service with LLM-specific metrics
3. Generating test load to populate the metrics
4. Custom metrics dashboard
5. Alerting thresholds

In [None]:
# Install required packages
!pip install -q prometheus-client psutil matplotlib

In [None]:
import time
import threading
import psutil
import numpy as np
import matplotlib.pyplot as plt
from prometheus_client import Counter, Histogram, Gauge, Summary, start_http_server
from collections import defaultdict, deque

## 1. Setting Up Prometheus Metrics

Define comprehensive metrics for LLM services.

In [None]:
class LLMMetrics:
    """Prometheus metrics for LLM services.

    Groups every metric the service records (requests, tokens, queue, cost,
    system resources, and errors) on one object so callers can write
    ``metrics.requests_total.labels(...).inc()``.

    Args:
        registry: Optional ``prometheus_client`` collector registry to
            register the metrics with. When omitted, the library's default
            global registry is used, exactly as before. Supplying a dedicated
            registry lets tests (or a re-run notebook cell) create a fresh
            instance without registering the same metric names twice in the
            global registry.
    """

    def __init__(self, registry=None):
        # Forward ``registry`` only when one was supplied, so the library's
        # own default applies otherwise. Passing ``registry=None`` through
        # explicitly would tell prometheus_client not to register the metric.
        registry_kwargs = {} if registry is None else {'registry': registry}

        # Request metrics
        self.requests_total = Counter(
            'llm_requests_total',
            'Total number of LLM requests',
            ['tenant_id', 'model_name', 'status'],
            **registry_kwargs
        )

        self.request_duration = Histogram(
            'llm_request_duration_seconds',
            'Request duration in seconds',
            ['tenant_id', 'model_name'],
            buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0],
            **registry_kwargs
        )

        # Token metrics
        self.tokens_processed = Counter(
            'llm_tokens_processed_total',
            'Total tokens processed',
            ['tenant_id', 'model_name', 'type'],  # type: input/output
            **registry_kwargs
        )

        self.tokens_per_second = Gauge(
            'llm_tokens_per_second',
            'Current tokens per second throughput',
            ['model_name'],
            **registry_kwargs
        )

        # Queue metrics
        self.queue_length = Gauge(
            'llm_queue_length',
            'Current queue length',
            ['model_name'],
            **registry_kwargs
        )

        # No explicit buckets here, so the library's default buckets apply.
        self.queue_wait_time = Histogram(
            'llm_queue_wait_seconds',
            'Time spent waiting in queue',
            ['model_name'],
            **registry_kwargs
        )

        # Cost metrics
        self.cost_total = Counter(
            'llm_cost_usd_total',
            'Total cost in USD',
            ['tenant_id', 'model_name'],
            **registry_kwargs
        )

        # System metrics
        self.gpu_utilization = Gauge(
            'llm_gpu_utilization_percent',
            'GPU utilization percentage',
            ['gpu_id'],
            **registry_kwargs
        )

        self.memory_usage = Gauge(
            'llm_memory_usage_bytes',
            'Memory usage in bytes',
            ['type'],  # type: gpu/cpu
            **registry_kwargs
        )

        # Error metrics
        self.errors_total = Counter(
            'llm_errors_total',
            'Total number of errors',
            ['tenant_id', 'model_name', 'error_type'],
            **registry_kwargs
        )

# Create the module-level metrics instance. InstrumentedLLMService.__init__
# reads this global, so it must exist before any service is constructed.
metrics = LLMMetrics()
print("Prometheus metrics initialized")

## 2. Instrumented LLM Service

Create a simulated LLM service that records comprehensive metrics for every request.

In [None]:
class InstrumentedLLMService:
    """Simulated LLM service that records Prometheus metrics for every request.

    Construction reads the module-level ``metrics`` object and starts a daemon
    thread that refreshes the system gauges (CPU memory, mock GPU figures and
    queue length) every 5 seconds until :meth:`stop` is called.
    """

    def __init__(self, model_name="gpt-3.5-turbo"):
        self.model_name = model_name
        self.metrics = metrics
        self.request_queue = deque()
        self.active_requests = 0
        # Set by stop() to end the background collection loop.
        self._stop_event = threading.Event()

        # Start system metrics collection
        self._start_system_metrics()

    def _start_system_metrics(self):
        """Start background system metrics collection in a daemon thread."""
        def collect_system_metrics():
            while not self._stop_event.is_set():
                # CPU memory
                memory = psutil.virtual_memory()
                self.metrics.memory_usage.labels(type='cpu').set(memory.used)

                # Mock GPU metrics (in production, use nvidia-ml-py)
                gpu_util = np.random.uniform(40, 95)
                gpu_memory = np.random.uniform(2e9, 8e9)  # 2-8GB

                self.metrics.gpu_utilization.labels(gpu_id='0').set(gpu_util)
                self.metrics.memory_usage.labels(type='gpu').set(gpu_memory)

                # Queue length
                self.metrics.queue_length.labels(
                    model_name=self.model_name
                ).set(len(self.request_queue))

                # Update every 5 seconds, but wake immediately on stop().
                self._stop_event.wait(5)

        thread = threading.Thread(target=collect_system_metrics, daemon=True)
        thread.start()

    def stop(self):
        """Ask the background system metrics thread to exit."""
        self._stop_event.set()

    def generate(self, prompt: str, tenant_id: str, max_tokens: int = 100) -> dict:
        """Generate (simulated) text and record metrics for the request.

        Args:
            prompt: Input text; tokens are counted by whitespace splitting.
            tenant_id: Tenant label attached to the per-tenant metrics.
            max_tokens: Upper bound on the number of generated tokens; also
                drives the simulated generation time (0.01 s per token).

        Returns:
            A dict with ``text``, ``usage`` (``prompt_tokens``,
            ``completion_tokens``, ``total_tokens``), ``cost`` and
            ``duration``.

        Raises:
            ValueError: If ``prompt`` is empty or whitespace only, or if
                ``max_tokens`` is less than 1.
        """
        # Queue management. Intervals use the monotonic perf_counter clock so
        # system clock adjustments cannot skew them; the queue entry keeps a
        # wall-clock timestamp.
        queue_start = time.perf_counter()
        self.request_queue.append({'tenant_id': tenant_id, 'start_time': time.time()})

        # Simulate queue processing
        time.sleep(0.1)  # Queue wait time
        self.request_queue.popleft()

        queue_wait = time.perf_counter() - queue_start
        self.metrics.queue_wait_time.labels(model_name=self.model_name).observe(queue_wait)

        # Start request processing
        start_time = time.perf_counter()
        self.active_requests += 1

        # Set just before a validation failure is raised so the handler below
        # records it once, under 'validation_error', instead of counting the
        # same failure a second time under the exception class name.
        error_type = None

        try:
            # Input validation
            if not prompt.strip():
                error_type = 'validation_error'
                raise ValueError("Empty prompt")
            if max_tokens < 1:
                error_type = 'validation_error'
                raise ValueError("max_tokens must be at least 1")

            # Token counting
            input_tokens = len(prompt.split())
            self.metrics.tokens_processed.labels(
                tenant_id=tenant_id,
                model_name=self.model_name,
                type='input'
            ).inc(input_tokens)

            # Simulate generation
            generation_time = max_tokens * 0.01
            time.sleep(generation_time)

            # Generate response. The lower bound is capped at max_tokens so
            # that limits below 10 do not hand randint an empty range.
            min_output = min(10, max_tokens)
            output_tokens = int(np.random.randint(min_output, max_tokens + 1))
            generated_text = f" Generated response with {output_tokens} tokens."

            # Record output tokens
            self.metrics.tokens_processed.labels(
                tenant_id=tenant_id,
                model_name=self.model_name,
                type='output'
            ).inc(output_tokens)

            # Calculate throughput
            total_time = time.perf_counter() - start_time
            throughput = output_tokens / total_time if total_time > 0 else 0
            self.metrics.tokens_per_second.labels(model_name=self.model_name).set(throughput)

            # Calculate cost
            total_tokens = input_tokens + output_tokens
            cost = total_tokens * 0.001  # $0.001 per token
            self.metrics.cost_total.labels(
                tenant_id=tenant_id,
                model_name=self.model_name
            ).inc(cost)

            # Record successful request
            self.metrics.requests_total.labels(
                tenant_id=tenant_id,
                model_name=self.model_name,
                status='success'
            ).inc()

            # Record duration
            duration = time.perf_counter() - start_time
            self.metrics.request_duration.labels(
                tenant_id=tenant_id,
                model_name=self.model_name
            ).observe(duration)

            return {
                'text': generated_text,
                'usage': {
                    'prompt_tokens': input_tokens,
                    'completion_tokens': output_tokens,
                    'total_tokens': total_tokens
                },
                'cost': cost,
                'duration': duration
            }

        except Exception as e:
            # Record the failed request and exactly one error sample for it.
            self.metrics.requests_total.labels(
                tenant_id=tenant_id,
                model_name=self.model_name,
                status='error'
            ).inc()

            self.metrics.errors_total.labels(
                tenant_id=tenant_id,
                model_name=self.model_name,
                error_type=error_type or type(e).__name__
            ).inc()

            raise

        finally:
            self.active_requests -= 1

# Create the service with its default model name ("gpt-3.5-turbo").
# Construction also starts the background system-metrics thread.
llm_service = InstrumentedLLMService()
print("Instrumented LLM service initialized")

## 3. Testing Metrics Collection

Generate some test requests to collect metrics.

In [None]:
def generate_test_load():
    """Send a fixed batch of sample prompts through ``llm_service``.

    Five requests from three tenants are issued one after another with a
    half-second pause after each. One of them has an empty prompt so the
    error path is exercised as well. Each outcome is printed; failures are
    reported and do not stop the run.
    """
    print("Generating test load...")

    sample_requests = [
        {"prompt": "What is AI?", "tenant": "tenant_1", "tokens": 20},
        {"prompt": "Explain ML", "tenant": "tenant_2", "tokens": 30},
        {"prompt": "Deep learning overview", "tenant": "tenant_1", "tokens": 25},
        {"prompt": "", "tenant": "tenant_3", "tokens": 15},  # Error case
        {"prompt": "Neural networks", "tenant": "tenant_2", "tokens": 40},
    ]

    for number, request in enumerate(sample_requests, start=1):
        try:
            response = llm_service.generate(
                prompt=request["prompt"],
                tenant_id=request["tenant"],
                max_tokens=request["tokens"],
            )
            token_total = response['usage']['total_tokens']
            print(f"Request {number}: Success - {token_total} tokens")
        except Exception as exc:
            print(f"Request {number}: Error - {exc}")

        # Small delay between requests
        time.sleep(0.5)

    print("Test load generation complete")

# Run the sample requests against llm_service so the metrics have data.
# This takes a few seconds: every request and every gap between requests sleeps.
generate_test_load()

## 4. Metrics Dashboard

Create a simple metrics dashboard using matplotlib.

In [None]:
def _summarize_metric_samples(metric_families):
    """Aggregate raw Prometheus samples into the figures the dashboard plots.

    Label values live in ``sample.labels`` rather than in ``sample.name``, so
    each breakdown (by status, token type, or tenant) is grouped on the
    relevant label.

    Args:
        metric_families: Iterable of metric families, each exposing
            ``samples`` whose items have ``name``, ``labels`` and ``value``.

    Returns:
        A dict with ``requests_by_status``, ``tokens_by_type`` and
        ``cost_by_tenant`` (label value -> summed sample value),
        ``gpu_utilization`` (mean over all GPU series, 0 if none) and
        ``queue_length`` (sum over all model series).
    """
    requests_by_status = defaultdict(float)
    tokens_by_type = defaultdict(float)
    cost_by_tenant = defaultdict(float)
    gpu_readings = []
    queue_length = 0.0

    for metric_family in metric_families:
        for sample in metric_family.samples:
            labels = sample.labels
            # Exact name matches skip the companion ``*_created`` samples
            # that counters also expose.
            if sample.name == 'llm_requests_total':
                requests_by_status[labels.get('status', 'unknown')] += sample.value
            elif sample.name == 'llm_tokens_processed_total':
                tokens_by_type[labels.get('type', 'unknown')] += sample.value
            elif sample.name == 'llm_cost_usd_total':
                cost_by_tenant[labels.get('tenant_id', 'unknown')] += sample.value
            elif sample.name == 'llm_gpu_utilization_percent':
                gpu_readings.append(sample.value)
            elif sample.name == 'llm_queue_length':
                queue_length += sample.value

    gpu_utilization = sum(gpu_readings) / len(gpu_readings) if gpu_readings else 0

    return {
        'requests_by_status': requests_by_status,
        'tokens_by_type': tokens_by_type,
        'cost_by_tenant': cost_by_tenant,
        'gpu_utilization': gpu_utilization,
        'queue_length': queue_length,
    }


def create_metrics_dashboard():
    """Create a simple metrics dashboard.

    Reads the current samples from the default Prometheus registry, draws a
    2x2 matplotlib figure (request success rate, token counts, system
    metrics, cost by tenant) and prints a text summary. Panels and summary
    lines fall back to a "no data" message when nothing has been recorded.
    """
    # Collect current metric values
    from prometheus_client import REGISTRY

    summary = _summarize_metric_samples(REGISTRY.collect())

    success_count = summary['requests_by_status'].get('success', 0)
    error_count = summary['requests_by_status'].get('error', 0)
    total_requests = success_count + error_count

    input_tokens = summary['tokens_by_type'].get('input', 0)
    output_tokens = summary['tokens_by_type'].get('output', 0)

    gpu_util = summary['gpu_utilization']
    queue_len = summary['queue_length']
    tenant_costs = summary['cost_by_tenant']

    # Create dashboard
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # Request counts by status. A pie chart of all-zero values is undefined,
    # so show a placeholder until at least one request has been recorded.
    if total_requests > 0:
        ax1.pie([success_count, error_count], labels=['Success', 'Error'],
                autopct='%1.1f%%', colors=['green', 'red'])
    else:
        ax1.text(0.5, 0.5, 'No request data available',
                 ha='center', va='center', transform=ax1.transAxes)
    ax1.set_title('Request Success Rate')

    # Token processing by type
    ax2.bar(['Input Tokens', 'Output Tokens'], [input_tokens, output_tokens],
            color=['blue', 'orange'])
    ax2.set_title('Token Processing')
    ax2.set_ylabel('Token Count')

    # System metrics
    ax3.bar(['GPU Utilization %', 'Queue Length'], [gpu_util, queue_len],
            color=['purple', 'brown'])
    ax3.set_title('System Metrics')

    # Cost by tenant (sorted so the bar order is stable between runs)
    if tenant_costs:
        tenants = sorted(tenant_costs)
        costs = [tenant_costs[tenant] for tenant in tenants]
        ax4.bar(tenants, costs, color=['cyan', 'magenta', 'yellow'])
        ax4.set_title('Cost by Tenant')
        ax4.set_ylabel('Cost (USD)')
    else:
        ax4.text(0.5, 0.5, 'No cost data available',
                 ha='center', va='center', transform=ax4.transAxes)
        ax4.set_title('Cost by Tenant')

    plt.tight_layout()
    plt.show()

    # Print summary
    print("\n=== Metrics Summary ===")
    print(f"Total requests: {total_requests}")
    if total_requests > 0:
        print(f"Success rate: {success_count / total_requests * 100:.1f}%")
    else:
        print("Success rate: N/A (no requests recorded)")
    print(f"Total tokens processed: {input_tokens + output_tokens}")
    print(f"GPU utilization: {gpu_util:.1f}%")
    print(f"Current queue length: {queue_len}")
    print(f"Total cost: ${sum(tenant_costs.values()):.4f}")

# Render the dashboard and print the text summary for the metrics recorded so far.
create_metrics_dashboard()

## 5. Alerting Configuration

Define alerting rules for critical metrics.

In [None]:
# Prometheus alerting rules, written to prometheus_alerts.yml below.
#
# HighErrorRate aggregates both sides of the division with
# `sum by (model_name)`. Dividing the raw per-series rates would match each
# error series only against itself (same tenant_id/model_name/status labels),
# so the ratio would always be 1 and the alert would fire on any error.
alerting_rules = """
# prometheus_alerts.yml
groups:
  - name: llm_service_alerts
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: |
          (
            sum by (model_name) (rate(llm_requests_total{status="error"}[5m]))
            /
            sum by (model_name) (rate(llm_requests_total[5m]))
          ) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.model_name }}"

      # High response time
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.model_name }}"

      # Queue length too high
      - alert: HighQueueLength
        expr: llm_queue_length > 50
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Queue length is too high"
          description: "Queue length is {{ $value }} for {{ $labels.model_name }}"

      # GPU utilization too high
      - alert: HighGPUUtilization
        expr: llm_gpu_utilization_percent > 95
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "GPU utilization is very high"
          description: "GPU {{ $labels.gpu_id }} utilization is {{ $value }}%"

      # Low throughput
      - alert: LowThroughput
        expr: llm_tokens_per_second < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low token generation throughput"
          description: "Throughput is {{ $value }} tokens/sec for {{ $labels.model_name }}"

      # Service down
      - alert: ServiceDown
        expr: up{job="llm-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "LLM service is down"
          description: "LLM service {{ $labels.instance }} is not responding"
"""

# Write alerting rules (explicit encoding so the output does not depend on
# the platform's default)
with open("prometheus_alerts.yml", "w", encoding="utf-8") as f:
    f.write(alerting_rules)

print("Prometheus alerting rules created: prometheus_alerts.yml")
print("\nAlert Types:")
print("- High error rate (>10% for 2 minutes)")
print("- High response time (95th percentile >10s)")
print("- High queue length (>50 requests)")
print("- High GPU utilization (>95% for 3 minutes)")
print("- Low throughput (<10 tokens/sec)")
print("- Service down (no response for 1 minute)")

## Conclusion

Comprehensive metrics collection enables:

1. **Performance Monitoring**: Track request duration, throughput, and queue metrics
2. **Resource Monitoring**: Monitor GPU, memory, and system utilization
3. **Business Metrics**: Track costs, usage by tenant, and token consumption
4. **Proactive Alerting**: Detect issues before they impact users

**Key Metrics for LLM Services**:
- Request rate and success rate
- Token processing throughput
- Queue length and wait times
- GPU utilization and memory usage
- Cost per tenant and per request

Next, we'll explore distributed tracing for request flow analysis.