# Day 35: GPU Monitoring for LLM Services - Part 4

Implementing comprehensive GPU monitoring for LLM services to track performance, utilization, and health.

## Overview
1. Setting up GPU monitoring with `pynvml`
2. Collecting key GPU metrics over time
3. GPU monitoring dashboard
4. Alerting on GPU metrics

In [None]:
# Install required packages
!pip install -q pynvml matplotlib

In [None]:
import time
import threading
import numpy as np
import matplotlib.pyplot as plt
from pynvml import *
from collections import defaultdict, deque

## 1. Setting Up GPU Monitoring

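Initialize NVML through `pynvml` to read GPU metrics. If NVML cannot be initialized (for example, on a machine without an NVIDIA driver), the monitor falls back to randomly generated mock metrics so the rest of the notebook still runs.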

In [None]:
class GPUMonitor:
    """Read per-GPU metrics through NVML, falling back to mock data.

    If NVML cannot be initialized (``NVMLError``), the monitor stays usable:
    ``get_gpu_metrics`` then returns randomly generated values for a single
    simulated device (``gpu_0``).

    Attributes:
        is_initialized: True once NVML was initialized and all device handles
            were obtained.
        device_count: Number of GPUs reported by NVML (0 when mocked).
        handles: NVML device handles indexed by GPU number (empty when mocked).
    """

    # Total memory of the simulated device, in bytes.
    _MOCK_MEMORY_TOTAL = 16e9

    def __init__(self):
        """Initialize NVML and cache one handle per visible GPU."""
        self.is_initialized = False
        # Defined up front so the attributes exist even when NVML is missing.
        self.device_count = 0
        self.handles = []
        try:
            nvmlInit()
            device_count = nvmlDeviceGetCount()
            handles = [nvmlDeviceGetHandleByIndex(i) for i in range(device_count)]
        except NVMLError as error:
            print(f"Failed to initialize NVML: {error}")
            print("GPU monitoring will be mocked.")
            return

        # Only publish the state once every NVML call succeeded, so a partial
        # failure never leaves a device count without matching handles.
        self.device_count = device_count
        self.handles = handles
        self.is_initialized = True
        print(f"NVML initialized. Found {self.device_count} GPUs.")

    def get_gpu_metrics(self) -> dict:
        """Return a snapshot of metrics for every GPU.

        Returns:
            A dict keyed by ``'gpu_<index>'``. Each value is a dict with:
            ``utilization_gpu`` and ``utilization_memory`` (percent, as
            reported by NVML; the memory figure is memory-controller activity,
            not capacity used), ``memory_total`` / ``memory_used`` /
            ``memory_free`` (bytes), ``temperature`` (GPU sensor reading) and
            ``power_usage`` (watts). Mock values are returned when NVML is
            not initialized.
        """
        if not self.is_initialized:
            return self._get_mock_metrics()

        metrics = {}
        for i, handle in enumerate(self.handles):
            utilization = nvmlDeviceGetUtilizationRates(handle)
            memory = nvmlDeviceGetMemoryInfo(handle)
            temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
            # NVML reports power in milliwatts; convert to watts.
            power = nvmlDeviceGetPowerUsage(handle) / 1000.0

            metrics[f'gpu_{i}'] = {
                'utilization_gpu': utilization.gpu,
                'utilization_memory': utilization.memory,
                'memory_total': memory.total,
                'memory_used': memory.used,
                'memory_free': memory.free,
                'temperature': temperature,
                'power_usage': power,
            }

        return metrics

    def _get_mock_metrics(self) -> dict:
        """Generate random metrics for one simulated GPU.

        The result has the same keys as the real metrics. ``memory_used`` is
        sampled once and ``memory_free`` is derived from it, so the two always
        add up to ``memory_total``.
        """
        memory_total = self._MOCK_MEMORY_TOTAL
        memory_used = np.random.uniform(8e9, 14e9)
        return {
            'gpu_0': {
                'utilization_gpu': np.random.uniform(60, 95),
                'utilization_memory': np.random.uniform(50, 80),
                'memory_total': memory_total,
                'memory_used': memory_used,
                'memory_free': memory_total - memory_used,
                'temperature': np.random.uniform(65, 85),
                'power_usage': np.random.uniform(150, 250),
            }
        }

    def shutdown(self):
        """Shut down NVML if it was initialized by this monitor.

        Does nothing when the monitor is running on mock data.
        """
        if self.is_initialized:
            nvmlShutdown()
            print("NVML shut down")

# Create the shared GPU monitor. Construction calls nvmlInit(); if that raises
# NVMLError the failure is printed and the monitor serves mock metrics instead.
gpu_monitor = GPUMonitor()

## 2. GPU Metrics Collector

Sample GPU metrics on a background thread at a fixed interval, keeping the most recent 100 samples of each metric.

In [None]:
class GPUMetricsCollector:
    """Sample GPU metrics periodically on a background thread.

    Each sample is stored as a ``(timestamp, value)`` tuple under the key
    ``'<gpu_id>_<metric>'``; only the latest 100 samples per key are kept.

    Attributes:
        monitor: Object providing ``get_gpu_metrics()``.
        interval: Seconds to wait between two samples.
        history: Mapping of series name to a bounded deque of samples.
        running: True while the collection loop is supposed to keep going.
        thread: The collector thread, or None before the first ``start()``.
    """

    def __init__(self, monitor: "GPUMonitor", interval=5):
        """Store the monitor and sampling interval; no thread is started yet."""
        self.monitor = monitor
        self.interval = interval
        self.history = defaultdict(lambda: deque(maxlen=100))
        self.running = False
        self.thread = None
        # Set by stop() to wake the loop from its inter-sample wait.
        self._stop_event = threading.Event()

    def start(self):
        """Start collecting metrics in the background.

        Calling ``start()`` while the collector is already running is a no-op,
        so at most one collector thread exists at a time.
        """
        if self.thread is not None and self.thread.is_alive():
            if self.running:
                print("GPU metrics collector already running")
                return
            # The flag was cleared but the old loop has not exited yet; let it
            # finish before launching a new one.
            self._stop_event.set()
            self.thread.join()

        self._stop_event.clear()
        self.running = True
        self.thread = threading.Thread(target=self._collect, daemon=True)
        self.thread.start()
        print("GPU metrics collector started")

    def stop(self):
        """Stop collecting metrics and wait for the thread to exit.

        The loop is woken immediately instead of finishing its current
        ``interval`` wait. Safe to call when the collector never started.
        """
        self.running = False
        self._stop_event.set()
        if self.thread is not None:
            self.thread.join()
        print("GPU metrics collector stopped")

    def _collect(self):
        """Background loop: take a sample, then wait ``interval`` seconds."""
        while self.running:
            metrics = self.monitor.get_gpu_metrics()
            timestamp = time.time()

            for gpu_id, data in metrics.items():
                for key, value in data.items():
                    self.history[f'{gpu_id}_{key}'].append((timestamp, value))

            # Interruptible sleep: returns True as soon as stop() sets the event.
            if self._stop_event.wait(self.interval):
                break

    def get_history(self) -> dict:
        """Return the live history mapping (not a copy).

        Keys are ``'<gpu_id>_<metric>'``; values are deques of
        ``(timestamp, value)`` tuples, oldest first.
        """
        return self.history

# Create the collector with a 1-second sampling interval and start its
# background (daemon) thread.
collector = GPUMetricsCollector(gpu_monitor, interval=1)
collector.start()

## 3. GPU Monitoring Dashboard

Create a simple 2×2 dashboard that plots utilization, memory usage, temperature, and power usage for GPU 0.

In [None]:
def create_gpu_dashboard(history: dict, gpu_id: str = 'gpu_0', temp_threshold: float = 85):
    """Plot a 2x2 dashboard of the collected metrics for one GPU.

    Args:
        history: Mapping of ``'<gpu_id>_<metric>'`` to a sequence of
            ``(timestamp, value)`` tuples, as produced by the collector.
        gpu_id: Which GPU's series to plot. Defaults to ``'gpu_0'``.
        temp_threshold: Temperature at which the dashed "critical" line is
            drawn on the temperature panel.

    Prints a message and returns without plotting when ``history`` is empty
    or holds no GPU-utilization samples for ``gpu_id``. The x-axis shows
    seconds elapsed since the first utilization sample rather than raw epoch
    timestamps.
    """
    if not history:
        print("No GPU metrics to display")
        return

    util_gpu_data = list(history.get(f'{gpu_id}_utilization_gpu', []))
    if not util_gpu_data:
        gpu_label = gpu_id[len('gpu_'):] if gpu_id.startswith('gpu_') else gpu_id
        print(f"No data for GPU {gpu_label}")
        return

    # Common time origin so all four panels share the same x scale.
    start_time = util_gpu_data[0][0]

    def series(metric):
        """Return (elapsed_seconds, values) for one metric of this GPU."""
        samples = list(history.get(f'{gpu_id}_{metric}', []))
        elapsed = [t - start_time for t, _ in samples]
        values = [v for _, v in samples]
        return elapsed, values

    # Every series carries its own x values, so a missing or shorter series
    # cannot cause a length mismatch inside ax.plot.
    util_gpu_t, util_gpu = series('utilization_gpu')
    util_mem_t, util_mem = series('utilization_memory')
    mem_t, mem_used = series('memory_used')
    temp_t, temperature = series('temperature')
    power_t, power = series('power_usage')

    # Create dashboard
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # GPU and Memory Utilization
    ax1.plot(util_gpu_t, util_gpu, label='GPU Utilization')
    ax1.plot(util_mem_t, util_mem, label='Memory Utilization')
    ax1.set_title('GPU and Memory Utilization')
    ax1.set_ylabel('Utilization (%)')

    # Memory Usage (bytes -> GB)
    ax2.plot(mem_t, [v / 1e9 for v in mem_used], label='Memory Used (GB)')
    ax2.set_title('GPU Memory Usage')
    ax2.set_ylabel('Memory (GB)')

    # Temperature
    ax3.plot(temp_t, temperature, label='Temperature', color='red')
    ax3.axhline(y=temp_threshold, color='r', linestyle='--', alpha=0.7,
                label='Critical Threshold')
    ax3.set_title('GPU Temperature')
    ax3.set_ylabel('Temperature (°C)')

    # Power Usage
    ax4.plot(power_t, power, label='Power Usage', color='purple')
    ax4.set_title('GPU Power Usage')
    ax4.set_ylabel('Power (W)')

    for ax in (ax1, ax2, ax3, ax4):
        ax.set_xlabel('Time (s)')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Let the background collector run for 10 seconds (roughly 10 samples at the
# 1-second interval), then stop it so the history is no longer being appended
# to while it is plotted.
print("Collecting GPU metrics for 10 seconds...")
time.sleep(10)
collector.stop()

# Plot the collected history
create_gpu_dashboard(collector.get_history())

## 4. Alerting on GPU Metrics

Create a simple alerting system for GPU metrics.

In [None]:
class GPUAlerter:
    """Simple threshold-based alerting for GPU metrics.

    Recognized threshold keys (a missing key falls back to
    ``DEFAULT_THRESHOLDS``):
        max_temp: Alert (critical) when temperature is above this value.
        max_mem_percent: Alert (warning) when used/total memory, in percent,
            is above this value.
        min_gpu_util: Alert (info) when GPU utilization is below this value.

    Attributes:
        thresholds: The threshold dict supplied by the caller.
        alerts: Alerts produced by the most recent ``check_alerts`` call.
    """

    DEFAULT_THRESHOLDS = {
        'max_temp': 85,
        'max_mem_percent': 90,
        'min_gpu_util': 10,
    }

    def __init__(self, thresholds: dict):
        """Store the thresholds; ``None`` means "use all defaults"."""
        self.thresholds = {} if thresholds is None else thresholds
        self.alerts = []

    def _threshold(self, name):
        """Return the configured threshold for ``name`` or its default."""
        return self.thresholds.get(name, self.DEFAULT_THRESHOLDS[name])

    def check_alerts(self, metrics: dict):
        """Check one metrics snapshot against the thresholds.

        Args:
            metrics: Mapping of GPU id to a dict containing ``temperature``,
                ``memory_used``, ``memory_total`` and ``utilization_gpu``.

        Returns:
            A new list of alert dicts with keys ``gpu_id``, ``metric``,
            ``value``, ``threshold`` and ``severity``. Lists returned by
            earlier calls are left untouched.
        """
        # Resolve each threshold once so the comparison and the reported
        # value can never disagree (and a missing key cannot raise KeyError).
        max_temp = self._threshold('max_temp')
        max_mem_percent = self._threshold('max_mem_percent')
        min_gpu_util = self._threshold('min_gpu_util')

        alerts = []
        for gpu_id, data in metrics.items():
            # Temperature alert
            if data['temperature'] > max_temp:
                alerts.append({
                    'gpu_id': gpu_id,
                    'metric': 'temperature',
                    'value': data['temperature'],
                    'threshold': max_temp,
                    'severity': 'critical',
                })

            # Memory usage alert; a device reporting zero total memory is
            # treated as 0% used instead of dividing by zero.
            memory_total = data['memory_total']
            if memory_total:
                mem_usage_percent = (data['memory_used'] / memory_total) * 100
            else:
                mem_usage_percent = 0.0
            if mem_usage_percent > max_mem_percent:
                alerts.append({
                    'gpu_id': gpu_id,
                    'metric': 'memory_usage',
                    'value': mem_usage_percent,
                    'threshold': max_mem_percent,
                    'severity': 'warning',
                })

            # GPU underutilization alert
            if data['utilization_gpu'] < min_gpu_util:
                alerts.append({
                    'gpu_id': gpu_id,
                    'metric': 'gpu_utilization',
                    'value': data['utilization_gpu'],
                    'threshold': min_gpu_util,
                    'severity': 'info',
                })

        # Rebind rather than clear-and-refill so results handed out by an
        # earlier call are not emptied behind the caller's back.
        self.alerts = alerts
        return alerts

# Alert limits: temperature in °C, memory and utilization in percent
alert_thresholds = dict(
    max_temp=80,
    max_mem_percent=90,
    min_gpu_util=10,
)

# Build the alerter from those limits
alerter = GPUAlerter(alert_thresholds)

# Evaluate a single fresh snapshot of the GPU metrics
current_metrics = gpu_monitor.get_gpu_metrics()
alerts = alerter.check_alerts(current_metrics)

print("\n=== GPU Alerts ===")
if not alerts:
    print("No active alerts. System is healthy.")
else:
    # One line per alert: severity, GPU, metric, observed value and limit
    for alert in alerts:
        print(f"[{alert['severity'].upper()}] {alert['gpu_id']}: {alert['metric']} is {alert['value']:.1f}, "
              f"threshold is {alert['threshold']}")

# Release NVML now that monitoring is finished
gpu_monitor.shutdown()

## Conclusion

GPU monitoring is crucial for LLM services:

1. **Performance Bottlenecks**: Identify underutilized GPUs or memory bottlenecks
2. **Hardware Health**: Monitor temperature and power to prevent damage
3. **Resource Optimization**: Ensure efficient use of expensive GPU resources
4. **Capacity Planning**: Understand usage patterns to plan for future needs

**Key Metrics**:
- GPU and memory utilization
- Memory usage (total, used, free)
- Temperature and power consumption
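- Error counts and throttling events (not collected in this notebook; NVML exposes them through its ECC error counters and clock-throttle-reason queries)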

Next, we'll integrate all observability components into a cohesive telemetry system.