# Lab 2.5-01: 監控系統建置

## 實驗目標

本節將建立完整的 vLLM 效能監控基礎設施，包括：
- Prometheus 時間序列資料庫配置
- vLLM 服務的 metrics 端點設置
- Grafana 視覺化儀表板創建
- 基礎監控指標驗證

## 監控架構概覽

```
┌─────────────────┐    HTTP /metrics    ┌─────────────────┐
│   vLLM Server   │◄────────────────────│   Prometheus    │
│  (Port 8000)    │                     │   (Port 9090)   │
│  /metrics       │                     │                 │
└─────────────────┘                     └─────────────────┘
                                                  │
                                         PromQL Queries
                                                  │
                                                  ▼
                                        ┌─────────────────┐
                                        │    Grafana      │
                                        │   (Port 3000)   │
                                        │   Dashboard     │
                                        └─────────────────┘
```

## 1. 環境準備與依賴安裝

In [None]:
# 安裝監控相關依賴套件
!pip install prometheus-client
!pip install psutil
!pip install nvidia-ml-py3
!pip install requests
!pip install matplotlib seaborn

In [None]:
import os
import time
import threading
import subprocess
import requests
import json
from pathlib import Path

# Prometheus 客戶端
from prometheus_client import start_http_server, Gauge, Counter, Histogram, CollectorRegistry
from prometheus_client.core import REGISTRY

# 系統監控
import psutil
# GPU monitoring is optional: the flag only becomes True once pynvml has been
# imported AND NVML has initialised successfully.
NVIDIA_GPU_AVAILABLE = False
try:
    import pynvml
except ImportError:
    print("警告: NVIDIA GPU 監控不可用")
else:
    try:
        pynvml.nvmlInit()
        NVIDIA_GPU_AVAILABLE = True
    except pynvml.NVMLError as e:
        # nvmlInit() fails with an NVMLError (not ImportError) when the
        # package is installed but no usable driver/GPU is present.
        print(f"警告: NVIDIA GPU 監控不可用 ({e})")

# 數據處理與視覺化
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("✅ 所有依賴套件已載入完成")

## 2. vLLM 服務配置與 Metrics 端點設置

In [None]:
# Monitoring-related settings for the vLLM service
VLLM_CONFIG = dict(
    model="microsoft/DialoGPT-medium",  # a smaller model, used for testing
    host="127.0.0.1",
    port=8000,
    metrics_port=8001,  # dedicated metrics endpoint
    max_model_len=1024,
    gpu_memory_utilization=0.7,
)

# Echo the configuration, one "key: value" pair per line
print("vLLM 配置:")
print("\n".join(f"  {name}: {setting}" for name, setting in VLLM_CONFIG.items()))

In [None]:
# Build the vLLM launch script.
#
# The literal starts directly with the shebang so that "#!/bin/bash" is the
# first line of the generated file, and every trailing backslash is escaped
# ("\\") so it reaches the script as a real shell line continuation instead of
# being consumed by Python as a string-literal line continuation.
#
# NOTE(review): --enable-metrics / --metrics-port do not appear to be options
# of vllm.entrypoints.openai.api_server (vLLM normally serves /metrics on the
# API port) — confirm against the installed vLLM version before relying on the
# separate metrics port.
vllm_script = f"""#!/bin/bash

# 啟動 vLLM 服務並啟用監控
python -m vllm.entrypoints.openai.api_server \\
    --model {VLLM_CONFIG['model']} \\
    --host {VLLM_CONFIG['host']} \\
    --port {VLLM_CONFIG['port']} \\
    --max-model-len {VLLM_CONFIG['max_model_len']} \\
    --gpu-memory-utilization {VLLM_CONFIG['gpu_memory_utilization']} \\
    --enable-metrics \\
    --metrics-port {VLLM_CONFIG['metrics_port']}
"""

# Save the launch script; the encoding is explicit because the script
# contains non-ASCII text.
script_path = "start_vllm_with_metrics.sh"
with open(script_path, "w", encoding="utf-8") as f:
    f.write(vllm_script)

# Make the script executable
os.chmod(script_path, 0o755)

print(f"✅ vLLM 啟動腳本已建立: {script_path}")
print("\n腳本內容:")
print(vllm_script)

## 3. 自定義 Metrics 收集器

In [None]:
class VLLMMetricsCollector:
    """Custom Prometheus metrics collector for a vLLM deployment.

    Owns a private ``CollectorRegistry`` that holds request-level metrics
    (latency histogram, active-request gauge, generated-token counter) and
    host-level metrics (CPU percentage, per-GPU memory usage in bytes).
    """

    def __init__(self, vllm_metrics_url="http://127.0.0.1:8001/metrics"):
        """Create the registry and register every custom metric on it.

        Args:
            vllm_metrics_url: URL queried by ``get_vllm_native_metrics``.
        """
        self.vllm_metrics_url = vllm_metrics_url
        self.registry = CollectorRegistry()
        own_registry = self.registry

        # Request-level metrics
        self.request_latency = Histogram(
            'vllm_request_latency_seconds',
            'Request latency in seconds',
            buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
            registry=own_registry,
        )
        self.active_requests = Gauge(
            'vllm_active_requests',
            'Number of active requests',
            registry=own_registry,
        )
        self.tokens_generated = Counter(
            'vllm_tokens_generated_total',
            'Total number of tokens generated',
            registry=own_registry,
        )

        # Host-level metrics; GPU memory is labelled per device
        self.gpu_memory_usage = Gauge(
            'vllm_gpu_memory_usage_bytes',
            'GPU memory usage in bytes',
            labelnames=['gpu_id'],
            registry=own_registry,
        )
        self.cpu_usage = Gauge(
            'vllm_cpu_usage_percent',
            'CPU usage percentage',
            registry=own_registry,
        )

    def collect_system_metrics(self):
        """Sample CPU usage and, when NVML is available, per-GPU memory usage.

        GPU collection errors are printed rather than raised.
        """
        # Sampled over a one-second interval
        self.cpu_usage.set(psutil.cpu_percent(interval=1))

        if not NVIDIA_GPU_AVAILABLE:
            return

        try:
            for gpu_index in range(pynvml.nvmlDeviceGetCount()):
                device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
                used_bytes = pynvml.nvmlDeviceGetMemoryInfo(device).used
                self.gpu_memory_usage.labels(gpu_id=str(gpu_index)).set(used_bytes)
        except Exception as err:
            print(f"GPU 指標收集失敗: {err}")

    def get_vllm_native_metrics(self):
        """Return the body of the vLLM metrics endpoint.

        Returns:
            The response text on HTTP 200; ``None`` for any other status code
            or when the request fails.
        """
        try:
            reply = requests.get(self.vllm_metrics_url, timeout=5)
            return reply.text if reply.status_code == 200 else None
        except requests.exceptions.RequestException:
            return None

    def start_metrics_server(self, port=8002):
        """Serve this collector's registry over HTTP on ``port``."""
        start_http_server(port, registry=self.registry)
        print(f"✅ Metrics 伺服器已啟動在 http://127.0.0.1:{port}/metrics")

    def simulate_request_metrics(self, latency=1.5, tokens=150):
        """Record one synthetic request (for testing).

        Observes ``latency``, adds ``tokens`` to the token counter and bumps
        the active-request gauge; the gauge is decremented again after
        ``latency`` seconds to mimic the request finishing.
        """
        self.request_latency.observe(latency)
        self.tokens_generated.inc(tokens)
        self.active_requests.inc()

        # Simulated completion of the request
        completion = threading.Timer(latency, self.active_requests.dec)
        completion.start()

# Instantiate the metrics collector with its default vLLM metrics URL
metrics_collector = VLLMMetricsCollector()
print("✅ Metrics 收集器已初始化")

## 4. Prometheus 配置檔案生成

In [None]:
# Build the Prometheus configuration
def _scrape_job(job_name, target, scrape_interval=None):
    """Return a static scrape job for a single target.

    When ``scrape_interval`` is given, the job overrides the global interval
    and states the ``/metrics`` path explicitly.
    """
    job = {"job_name": job_name, "static_configs": [{"targets": [target]}]}
    if scrape_interval is not None:
        job["scrape_interval"] = scrape_interval
        job["metrics_path"] = "/metrics"
    return job


prometheus_config = {
    "global": {"scrape_interval": "15s", "evaluation_interval": "15s"},
    "scrape_configs": [
        _scrape_job("vllm-native", f"127.0.0.1:{VLLM_CONFIG['metrics_port']}", "5s"),
        _scrape_job("vllm-custom", "127.0.0.1:8002", "5s"),
        _scrape_job("prometheus", "127.0.0.1:9090"),
    ],
}

# Serialise the configuration as block-style YAML
import yaml

prometheus_yaml = yaml.dump(prometheus_config, default_flow_style=False)

# Persist the Prometheus configuration file
with open("prometheus.yml", "w") as config_file:
    config_file.write(prometheus_yaml)

print("✅ Prometheus 配置檔案已生成: prometheus.yml")
print("\n配置內容:")
print(prometheus_yaml)

## 5. Grafana 儀表板配置

In [None]:
# Assemble the Grafana dashboard JSON payload
def _query(expr, legend):
    """Return one Prometheus query target for a panel."""
    return {"expr": expr, "legendFormat": legend}


def _panel(panel_id, title, kind, queries, x, y, y_label=None):
    """Return a panel dict placed in an 8-high, 12-wide grid cell at (x, y).

    A ``yAxes`` entry (label plus a zero minimum) is added only when
    ``y_label`` is given.
    """
    panel = {
        "id": panel_id,
        "title": title,
        "type": kind,
        "targets": queries,
        "gridPos": {"h": 8, "w": 12, "x": x, "y": y},
    }
    if y_label is not None:
        panel["yAxes"] = [{"label": y_label, "min": 0}]
    return panel


grafana_dashboard = {
    "dashboard": {
        "id": None,
        "title": "vLLM Performance Monitoring",
        "tags": ["vllm", "llm", "performance"],
        "timezone": "browser",
        "refresh": "5s",
        "time": {"from": "now-30m", "to": "now"},
        "panels": [
            _panel(
                1, "Request Latency", "graph",
                [
                    _query(
                        f"histogram_quantile({quantile}, rate(vllm_request_latency_seconds_bucket[5m]))",
                        f"{percentile} Latency",
                    )
                    for quantile, percentile in (("0.95", "P95"), ("0.50", "P50"))
                ],
                x=0, y=0, y_label="Seconds",
            ),
            _panel(
                2, "Active Requests", "singlestat",
                [_query("vllm_active_requests", "Active Requests")],
                x=12, y=0,
            ),
            _panel(
                3, "Token Generation Rate", "graph",
                [_query("rate(vllm_tokens_generated_total[5m])", "Tokens/sec")],
                x=0, y=8, y_label="Tokens per second",
            ),
            _panel(
                4, "GPU Memory Usage", "graph",
                [_query("vllm_gpu_memory_usage_bytes", "GPU {{gpu_id}} Memory")],
                x=12, y=8, y_label="Bytes",
            ),
        ],
    },
    "overwrite": True,
}

# Persist the dashboard definition
with open("vllm_dashboard.json", "w") as dashboard_file:
    json.dump(grafana_dashboard, dashboard_file, indent=2)

print("✅ Grafana 儀表板配置已生成: vllm_dashboard.json")

## 6. 監控服務啟動與驗證

In [None]:
# Start the custom metrics HTTP server
metrics_collector.start_metrics_server(port=8002)

# Setting this event stops the background collector without restarting the
# kernel: metrics_stop_event.set()
metrics_stop_event = threading.Event()


def collect_metrics_loop(stop_event=metrics_stop_event):
    """Refresh system metrics roughly every five seconds until stopped.

    Args:
        stop_event: ``threading.Event`` checked between samples; the loop
            exits once it is set. Bound as a default so a running thread keeps
            watching the event it was started with.
    """
    while not stop_event.is_set():
        metrics_collector.collect_system_metrics()
        # wait() acts as the 5-second pause but returns early once the event
        # is set, so the loop can be stopped promptly.
        stop_event.wait(5)


# Run the collection loop in the background
metrics_thread = threading.Thread(target=collect_metrics_loop, daemon=True)
metrics_thread.start()

print("✅ 系統指標收集已啟動")

In [None]:
# Probe a metrics endpoint and report its health
def verify_metrics_endpoint(url, name):
    """Fetch ``url`` and report whether it responds with HTTP 200.

    On success, prints a status line followed by the non-blank lines found
    among the first ten lines of the response body.

    Args:
        url: Metrics endpoint to query.
        name: Display name used in the printed messages.

    Returns:
        True on HTTP 200; False for any other status code or when the
        request fails.
    """
    try:
        reply = requests.get(url, timeout=5)
    except requests.exceptions.RequestException as err:
        print(f"❌ {name} metrics 端點無法連接: {url} ({err})")
        return False

    if reply.status_code != 200:
        print(f"❌ {name} metrics 端點異常: {url} (狀態碼: {reply.status_code})")
        return False

    print(f"✅ {name} metrics 端點正常: {url}")
    # Show a short excerpt of the exposed metrics
    print("   前 10 行 metrics:")
    for entry in reply.text.split('\n')[:10]:
        if entry.strip():
            print(f"     {entry}")
    return True

# Check the custom metrics endpoint
custom_metrics_ok = verify_metrics_endpoint("http://127.0.0.1:8002/metrics", "自定義")

# Try the vLLM native metrics endpoint (only reachable if the service is running)
vllm_metrics_ok = verify_metrics_endpoint(f"http://127.0.0.1:{VLLM_CONFIG['metrics_port']}/metrics", "vLLM 原生")

# When the vLLM endpoint did not respond, print the command that launches it
if not vllm_metrics_ok:
    print("\n💡 提示: vLLM 服務尚未啟動，請使用以下命令啟動:")
    print(f"   bash {script_path}")

## 7. 模擬監控數據生成

In [None]:
import random
import numpy as np

# Generate synthetic monitoring data
def simulate_monitoring_data(duration_minutes=5):
    """Feed synthetic request metrics to the collector for a fixed duration.

    Each iteration records one simulated request and then pauses for a random
    0.5-3.0 second interval. Progress is printed every ten requests.

    Args:
        duration_minutes: How long to keep generating requests, in minutes.
    """
    print(f"開始模擬 {duration_minutes} 分鐘的監控數據...")

    deadline = time.time() + duration_minutes * 60
    simulated = 0

    while time.time() < deadline:
        # Latency drawn from a normal distribution, floored at 0.1 s
        latency = max(0.1, np.random.normal(1.5, 0.5))
        # Number of generated tokens for this request
        tokens = random.randint(50, 300)

        metrics_collector.simulate_request_metrics(latency, tokens)
        simulated += 1

        # Random gap before the next request
        time.sleep(random.uniform(0.5, 3.0))

        if not simulated % 10:
            print(f"已模擬 {simulated} 個請求...")

    print(f"✅ 模擬完成，總共生成 {simulated} 個請求的監控數據")

# Run the data simulation in a background daemon thread
simulation_thread = threading.Thread(
    target=simulate_monitoring_data, 
    args=(2,),  # simulate for 2 minutes
    daemon=True
)
simulation_thread.start()

print("📊 監控數據模擬已開始...")

## 8. 監控指標即時視覺化

In [None]:
def visualize_current_metrics():
    """Render a 2x2 snapshot of host, GPU and metrics-endpoint status.

    Panels:
        * CPU utilisation (percent)
        * Memory utilisation (percent)
        * GPU compute and memory utilisation of device 0, or a placeholder
          when NVML is unavailable or the query fails
        * Reachability of the custom and vLLM metrics endpoints

    The host and GPU panels are drawn from local readings, so they no longer
    depend on the custom metrics endpoint being reachable; endpoint health is
    reported only in the status panel. Unexpected drawing errors are printed
    rather than raised, and the figure is shown with whatever was completed.
    """
    plt.style.use('seaborn-v0_8')
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('vLLM 即時監控指標', fontsize=16, fontweight='bold')

    try:
        # Current host readings
        cpu_usage = psutil.cpu_percent()
        memory_usage = psutil.virtual_memory().percent

        # CPU utilisation
        axes[0, 0].bar(['CPU'], [cpu_usage], color='skyblue')
        axes[0, 0].set_title('CPU 使用率 (%)')
        axes[0, 0].set_ylim(0, 100)
        axes[0, 0].set_ylabel('使用率 (%)')

        # Memory utilisation
        axes[0, 1].bar(['Memory'], [memory_usage], color='lightcoral')
        axes[0, 1].set_title('記憶體使用率 (%)')
        axes[0, 1].set_ylim(0, 100)
        axes[0, 1].set_ylabel('使用率 (%)')

        # GPU information (if available)
        if NVIDIA_GPU_AVAILABLE:
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_mem_percent = (mem_info.used / mem_info.total) * 100

                axes[1, 0].bar(['GPU Compute', 'GPU Memory'],
                               [gpu_util.gpu, gpu_mem_percent],
                               color=['orange', 'purple'])
                axes[1, 0].set_title('GPU 使用率 (%)')
                axes[1, 0].set_ylim(0, 100)
                axes[1, 0].set_ylabel('使用率 (%)')
            except Exception:
                # Best effort: any GPU query failure degrades to a placeholder.
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                axes[1, 0].text(0.5, 0.5, 'GPU 資訊\n無法取得',
                                ha='center', va='center', transform=axes[1, 0].transAxes)
                axes[1, 0].set_title('GPU 使用率 (N/A)')
        else:
            axes[1, 0].text(0.5, 0.5, 'GPU 不可用',
                            ha='center', va='center', transform=axes[1, 0].transAxes)
            axes[1, 0].set_title('GPU 使用率 (N/A)')

        # Metrics endpoint availability
        endpoints = [
            ('自定義 Metrics', 'http://127.0.0.1:8002/metrics'),
            ('vLLM Metrics', f'http://127.0.0.1:{VLLM_CONFIG["metrics_port"]}/metrics')
        ]

        statuses = []
        labels = []
        colors = []

        for name, url in endpoints:
            try:
                reachable = requests.get(url, timeout=2).status_code == 200
            except requests.exceptions.RequestException:
                # Connection refused, timeout, etc. count as "down"
                reachable = False
            statuses.append(1 if reachable else 0)
            colors.append('green' if reachable else 'red')
            labels.append(name)

        axes[1, 1].bar(labels, statuses, color=colors)
        axes[1, 1].set_title('Metrics 端點狀態')
        axes[1, 1].set_ylim(0, 1.2)
        axes[1, 1].set_ylabel('狀態 (1=正常, 0=異常)')
        # Rotate the existing category labels instead of re-setting tick
        # labels without fixed tick positions.
        axes[1, 1].tick_params(axis='x', labelrotation=45)

    except Exception as e:
        print(f"視覺化錯誤: {e}")

    plt.tight_layout()
    plt.show()

# Render the live monitoring snapshot
visualize_current_metrics()

## 9. 部署腳本生成

In [None]:
# Generate the end-to-end deployment script.
#
# The literal starts directly with the shebang (no leading newline) so that
# "#!/bin/bash" is the first line of the file; otherwise running
# ./deploy_monitoring.sh may fall back to a non-bash shell where the
# {prometheus,grafana} brace expansion below is not performed.
# `set -e` makes the script abort on the first failing command (e.g. a missing
# prometheus.yml) instead of still reporting success.
#
# NOTE(review): the compose file attaches Prometheus to a bridge network while
# prometheus.yml scrapes 127.0.0.1 targets. Inside the container that address
# is presumably the container itself, so the vLLM/custom exporters running on
# the host may be unreachable — confirm, and consider host networking or
# host.docker.internal targets.
deployment_script = """#!/bin/bash
set -e

# vLLM 監控系統部署腳本
echo "🚀 開始部署 vLLM 監控系統..."

# 檢查依賴
echo "📋 檢查系統依賴..."
command -v docker >/dev/null 2>&1 || { echo "❌ Docker 未安裝"; exit 1; }
command -v docker-compose >/dev/null 2>&1 || { echo "❌ Docker Compose 未安裝"; exit 1; }

# 建立目錄結構
mkdir -p monitoring/{prometheus,grafana}
cp prometheus.yml monitoring/prometheus/
cp vllm_dashboard.json monitoring/grafana/

# 生成 Docker Compose 檔案
cat > monitoring/docker-compose.yml << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: vllm-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: vllm-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana:/etc/grafana/provisioning/dashboards
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge

volumes:
  grafana-storage:
EOF

echo "✅ 部署檔案已生成"
echo "📍 執行以下命令啟動監控服務:"
echo "   cd monitoring"
echo "   docker-compose up -d"
echo ""
echo "🌐 服務端點:"
echo "   Prometheus: http://localhost:9090"
echo "   Grafana: http://localhost:3000 (admin/admin)"
echo "   vLLM API: http://localhost:8000"
echo "   vLLM Metrics: http://localhost:8001/metrics"
echo "   Custom Metrics: http://localhost:8002/metrics"
"""

# Save the deployment script; the encoding is explicit because the script
# contains emoji and CJK text that a non-UTF-8 default encoding cannot write.
with open("deploy_monitoring.sh", "w", encoding="utf-8") as f:
    f.write(deployment_script)

# Make the script executable
os.chmod("deploy_monitoring.sh", 0o755)

print("✅ 部署腳本已生成: deploy_monitoring.sh")
print("\n執行 `bash deploy_monitoring.sh` 來部署完整的監控系統")

## 10. 監控設置驗證總結

In [None]:
# Final report on the monitoring setup
def generate_setup_summary():
    """Print a summary of the monitoring setup.

    Reports which generated files exist (with their sizes), whether each
    service endpoint answers with HTTP 200, and the follow-up steps.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🎯 vLLM 監控系統建置完成總結")
    print(banner)

    # Generated artefacts
    print("\n📁 生成的設定檔案:")
    for artefact in (
        "start_vllm_with_metrics.sh",
        "prometheus.yml",
        "vllm_dashboard.json",
        "deploy_monitoring.sh",
    ):
        if not os.path.exists(artefact):
            print(f"   ❌ {artefact} (遺失)")
            continue
        print(f"   ✅ {artefact} ({os.path.getsize(artefact)} bytes)")

    # Service endpoints
    print("\n🌐 服務端點狀態:")
    probes = [
        ("自定義 Metrics", "http://127.0.0.1:8002/metrics"),
        ("vLLM Metrics", f"http://127.0.0.1:{VLLM_CONFIG['metrics_port']}/metrics"),
        ("vLLM API", f"http://127.0.0.1:{VLLM_CONFIG['port']}/v1/models"),
    ]
    for label, address in probes:
        try:
            status = requests.get(address, timeout=3).status_code
        except requests.exceptions.RequestException:
            print(f"   ❌ {label}: {address} (無法連接)")
            continue
        if status == 200:
            print(f"   ✅ {label}: {address}")
        else:
            print(f"   ⚠️  {label}: {address} (狀態: {status})")

    # Follow-up steps
    print("\n🚀 後續步驟:")
    for step in (
        "   1. 啟動 vLLM 服務: bash start_vllm_with_metrics.sh",
        "   2. 部署監控服務: bash deploy_monitoring.sh",
        "   3. 訪問 Grafana: http://localhost:3000 (admin/admin)",
        "   4. 導入儀表板: vllm_dashboard.json",
        "   5. 繼續進行 02-Real_Time_Metrics.ipynb",
    ):
        print(step)

    print("\n" + banner)

# Print the setup summary report
generate_setup_summary()

## 實驗總結

本實驗成功建立了完整的 vLLM 監控基礎設施，包括：

### ✅ 完成項目
1. **監控架構設計**: 建立了 Prometheus + Grafana 的監控技術棧
2. **Metrics 收集器**: 開發了自定義的 vLLM 指標收集器
3. **配置檔案生成**: 自動生成了 Prometheus 和 Grafana 的配置
4. **部署腳本**: 建立了一鍵部署監控系統的腳本
5. **即時視覺化**: 實現了基礎的監控指標視覺化

### 🎯 核心成果
- **自定義 Metrics 伺服器**: 在 port 8002 提供系統級監控指標
- **vLLM 整合**: 準備好與 vLLM 原生 metrics 的整合
- **Grafana 儀表板**: 預配置的效能監控儀表板
- **Docker 化部署**: 容器化的監控服務部署方案

### 📋 下一步
繼續進行 **02-Real_Time_Metrics.ipynb**，學習即時監控指標的收集與分析。