# Tutorial 17: Monitoring ML Systems

## Module 7: Monitoring and Infrastructure

## Learning Objectives

1. Understand why ML systems fail in production
2. Implement operational metrics monitoring (latency, throughput, errors)
3. Track ML-specific metrics (prediction distribution, feature statistics)
4. Build logging and alerting systems
5. Design monitoring dashboards

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from collections import deque
import json
import random
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field
from enum import Enum
import warnings
# Suppress every warning category so library notices do not clutter the notebook output.
warnings.filterwarnings('ignore')

# NOTE(review): the 'seaborn-v0_8-*' style names presumably exist only in newer
# matplotlib releases — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed both NumPy's global generator and the stdlib `random` module so runs are repeatable.
np.random.seed(42)
random.seed(42)
print('Setup complete!')

## 1. Why ML Systems Fail

Common failure modes include:
- **Data Issues**: Drift, missing data, schema changes
- **Model Issues**: Staleness, concept drift, bias
- **Infrastructure**: Latency, OOM errors, network issues

In [None]:
class FailureMode(Enum):
    """Identifiers for the kinds of production failure catalogued below."""

    DATA_DRIFT = 'data_drift'
    CONCEPT_DRIFT = 'concept_drift'
    MODEL_STALENESS = 'model_staleness'
    DATA_QUALITY = 'data_quality'
    INFRASTRUCTURE = 'infrastructure'


@dataclass
class FailureModeInfo:
    """Descriptive record for one failure mode (all fields are free text)."""

    name: str
    description: str
    detection: str
    mitigation: str
    severity: str


# Catalogue keyed by FailureMode; insertion order is the order printed below.
FAILURES = {
    FailureMode.DATA_DRIFT: FailureModeInfo(
        name='Data Drift',
        description='Input distribution changes',
        detection='Statistical tests',
        mitigation='Retrain model',
        severity='high',
    ),
    FailureMode.CONCEPT_DRIFT: FailureModeInfo(
        name='Concept Drift',
        description='Feature-target relationship changes',
        detection='Monitor accuracy',
        mitigation='Retrain',
        severity='critical',
    ),
    FailureMode.MODEL_STALENESS: FailureModeInfo(
        name='Model Staleness',
        description='Performance degrades over time',
        detection='Track model age',
        mitigation='Schedule retraining',
        severity='medium',
    ),
    FailureMode.DATA_QUALITY: FailureModeInfo(
        name='Data Quality',
        description='Missing/corrupted data',
        detection='Schema validation',
        mitigation='Data validation pipeline',
        severity='high',
    ),
    FailureMode.INFRASTRUCTURE: FailureModeInfo(
        name='Infrastructure',
        description='Resource exhaustion',
        detection='Health checks',
        mitigation='Auto-scaling',
        severity='critical',
    ),
}

print('ML SYSTEM FAILURE MODES')
print('=' * 60)
# Only the descriptive records are needed for the listing, not the enum keys.
for details in FAILURES.values():
    severity_tag = details.severity.upper()
    print(f'\n[{severity_tag}] {details.name}: {details.description}')

In [None]:
# Visualize failure impact: two horizontal bar charts sharing the same scenario labels.
scenarios = ['Data Drift', 'Infrastructure', 'Staleness', 'Data Quality']
revenue_loss = [150000, 80000, 200000, 50000]
recovery_hours = [48, 4, 24, 8]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Four evenly spaced colormap positions, one per bar.
shades = np.linspace(0.4, 0.8, 4)

# Each panel is described as (axis, bar values, colormap, x-axis label, title).
panels = [
    (axes[0], revenue_loss, plt.cm.Reds, 'Revenue Loss ($)', 'Estimated Revenue Impact'),
    (axes[1], recovery_hours, plt.cm.Blues, 'Hours', 'Time to Recovery'),
]
for panel_ax, bar_values, cmap, x_label, panel_title in panels:
    panel_ax.barh(scenarios, bar_values, color=cmap(shades))
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()
print(f'Total potential loss: ${sum(revenue_loss):,}')

## 2. Operational Metrics Monitoring

Key metrics: Latency (P50, P95, P99), Throughput (QPS), Error Rate, Resource Utilization

In [None]:
@dataclass
class RequestMetrics:
    """Measurements captured for one served request."""

    timestamp: datetime
    latency_ms: float
    success: bool
    cpu_util: float
    memory_util: float


class OperationalMetricsCollector:
    """Holds the most recent request measurements and summarises them on demand."""

    def __init__(self, window_size: int = 1000):
        # Bounded deque: once `window_size` samples are held, each append evicts the oldest.
        self.metrics: deque = deque(maxlen=window_size)

    def record(self, latency_ms, success, cpu_util, memory_util):
        """Append one request, timestamped with the current local time."""
        sample = RequestMetrics(
            timestamp=datetime.now(),
            latency_ms=latency_ms,
            success=success,
            cpu_util=cpu_util,
            memory_util=memory_util,
        )
        self.metrics.append(sample)

    def compute_stats(self) -> Dict[str, float]:
        """Summarise the current window.

        Returns:
            An empty dict when nothing has been recorded; otherwise the latency
            mean and P50/P95/P99, the error rate (1 - mean success), mean CPU
            and memory utilisation, and the number of samples in the window.
        """
        window = self.metrics
        if len(window) == 0:
            return {}

        # Split the samples into one column per field in a single pass.
        latencies, outcomes, cpu_values, memory_values = [], [], [], []
        for sample in window:
            latencies.append(sample.latency_ms)
            outcomes.append(sample.success)
            cpu_values.append(sample.cpu_util)
            memory_values.append(sample.memory_util)

        summary = {'latency_mean': np.mean(latencies)}
        for pct in (50, 95, 99):
            summary[f'latency_p{pct}'] = np.percentile(latencies, pct)
        summary['error_rate'] = 1 - np.mean(outcomes)
        summary['cpu_mean'] = np.mean(cpu_values)
        summary['memory_mean'] = np.mean(memory_values)
        summary['count'] = len(window)
        return summary


collector = OperationalMetricsCollector()
print('OperationalMetricsCollector created!')

In [None]:
# Simulate production traffic: 1000 requests, roughly 5% of them slow outliers.
for _ in range(1000):
    # The order of random draws below is kept fixed so seeded runs stay reproducible.
    if random.random() < 0.05:
        latency = 50 * random.uniform(5, 20)
    else:
        latency = np.random.lognormal(np.log(50), 0.5)
    succeeded = random.random() > 0.02
    cpu_sample = np.random.beta(2, 5) * 100
    memory_sample = np.clip(40 + np.random.normal(0, 10), 0, 100)
    collector.record(latency, succeeded, cpu_sample, memory_sample)

stats = collector.compute_stats()
print('Operational Metrics Summary')
print('=' * 40)
print(f'Latency P50: {stats["latency_p50"]:.1f}ms')
print(f'Latency P99: {stats["latency_p99"]:.1f}ms')
print(f'Error Rate: {stats["error_rate"]*100:.2f}%')
print(f'CPU: {stats["cpu_mean"]:.1f}%')

In [None]:
# Visualize operational metrics as a 2x2 dashboard:
# latency histogram, resource utilization, success rate, and latency trend.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Snapshot the collector's deque into a list so every panel reads the same samples.
metrics_list = list(collector.metrics)
latencies = [m.latency_ms for m in metrics_list]

# Top-left: latency histogram with dashed markers at P50 / P95 / P99.
ax = axes[0, 0]
ax.hist(latencies, bins=50, color='steelblue', alpha=0.7)
for p, c, l in [(50, 'green', 'P50'), (95, 'orange', 'P95'), (99, 'red', 'P99')]:
    ax.axvline(np.percentile(latencies, p), color=c, linestyle='--', label=f'{l}: {np.percentile(latencies, p):.0f}ms')
ax.set_xlabel('Latency (ms)')
ax.set_title('Latency Distribution')
ax.legend()
# Cut the x-axis at the 99.5th percentile so the extreme tail does not squash the histogram.
ax.set_xlim(0, np.percentile(latencies, 99.5))

# Top-right: 50-sample rolling means of CPU and memory, with a fixed warning line at 80.
ax = axes[0, 1]
cpu = pd.Series([m.cpu_util for m in metrics_list]).rolling(50).mean()
mem = pd.Series([m.memory_util for m in metrics_list]).rolling(50).mean()
ax.plot(cpu, label='CPU', color='blue')
ax.plot(mem, label='Memory', color='purple')
ax.axhline(80, color='red', linestyle='--', label='Warning')
ax.set_title('Resource Utilization')
ax.legend()
ax.set_ylim(0, 100)

# Bottom-left: 100-sample rolling success rate in percent, against a 99% reference line.
ax = axes[1, 0]
success_rate = pd.Series([m.success for m in metrics_list]).rolling(100).mean() * 100
ax.plot(success_rate, color='green')
ax.axhline(99, color='orange', linestyle='--', label='SLA (99%)')
ax.set_title('Success Rate')
# Zoom the y-axis to 95-100.5 so small dips in the success rate are visible.
ax.set_ylim(95, 100.5)
ax.legend()

# Bottom-right: 50-sample rolling mean latency, against a 500 ms reference line.
ax = axes[1, 1]
lat_ma = pd.Series(latencies).rolling(50).mean()
ax.plot(lat_ma, color='blue')
ax.axhline(500, color='red', linestyle='--', label='SLA Limit')
ax.set_title('Latency Trend')
ax.legend()

plt.tight_layout()
# The title is added after tight_layout; y=1.02 places it just above the top of the figure.
plt.suptitle('Operational Dashboard', y=1.02, fontsize=14, fontweight='bold')
plt.show()

## 3. ML-Specific Metrics

ML systems need specialized monitoring for prediction distribution, feature statistics, and model confidence.

In [None]:
@dataclass
class Prediction:
    """One model output together with the feature values that produced it."""

    timestamp: datetime
    prediction: float
    confidence: float
    features: Dict[str, float]


class MLMetricsCollector:
    """Keeps the most recent predictions and summarises their distribution."""

    def __init__(self, window_size: int = 5000):
        # Bounded deque: the oldest prediction is evicted once the window is full.
        self.predictions: deque = deque(maxlen=window_size)
        # Optional baseline statistics; stored as given and not interpreted by this class.
        self.reference_stats: Optional[Dict] = None

    def set_reference(self, stats: Dict):
        """Store `stats` as the reference baseline."""
        self.reference_stats = stats

    def record(self, pred: Prediction):
        """Append one prediction to the window."""
        self.predictions.append(pred)

    def get_stats(self) -> Dict[str, float]:
        """Summarise the current window.

        Returns:
            An empty dict when nothing has been recorded; otherwise the mean
            prediction, the share of predictions >= 0.5, the mean confidence,
            and the share of confidences < 0.5.
        """
        window = self.predictions
        if len(window) == 0:
            return {}

        scores = [item.prediction for item in window]
        confidences = [item.confidence for item in window]
        is_positive = [score >= 0.5 for score in scores]
        is_low_confidence = [conf < 0.5 for conf in confidences]

        summary = {}
        summary['pred_mean'] = np.mean(scores)
        summary['positive_rate'] = np.mean(is_positive)
        summary['conf_mean'] = np.mean(confidences)
        summary['low_conf_rate'] = np.mean(is_low_confidence)
        return summary


ml_collector = MLMetricsCollector()
print('MLMetricsCollector created!')

In [None]:
# Simulate predictions with drift at index 1500
ref = {
    'age': {'mean': 35, 'std': 10},
    'income': {'mean': 50000, 'std': 20000},
    'amount': {'mean': 100, 'std': 50},
}
ml_collector.set_reference(ref)

for i in range(2000):
    is_drift = i >= 1500

    # Draw every feature around its reference mean; under drift the mean moves up by 20%.
    features = {}
    for feature_name, spec in ref.items():
        shifted_mean = spec['mean'] + (spec['mean']*0.2 if is_drift else 0)
        features[feature_name] = np.random.normal(shifted_mean, spec['std'])

    # Positive probability: a base rate plus a term driven by the 'amount' feature.
    base_rate = 0.15 if is_drift else 0.1
    prob = np.clip(base_rate + (features['amount'] - 100) / 500, 0.01, 0.99)

    if random.random() < prob:
        prediction = 1.0
    else:
        prediction = 0.0
    confidence = np.clip(prob + np.random.normal(0, 0.1), 0.1, 0.99)

    ml_collector.record(Prediction(datetime.now(), prediction, confidence, features))

ml_stats = ml_collector.get_stats()
print('ML Metrics Summary')
print('=' * 40)
print(f'Positive Rate: {ml_stats["positive_rate"]*100:.1f}%')
print(f'Avg Confidence: {ml_stats["conf_mean"]:.3f}')

In [None]:
# Visualize ML metrics as a 2x2 dashboard:
# prediction rate, confidence histogram, 'amount' feature before/after drift, confidence trend.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Snapshot the collector's deque into a list so it can be sliced by position below.
preds = list(ml_collector.predictions)
predictions = [p.prediction for p in preds]
confidences = [p.confidence for p in preds]

# Top-left: 100-sample rolling mean of the predictions, with the index where drift was simulated marked.
ax = axes[0, 0]
rolling = pd.Series(predictions).rolling(100).mean()
ax.plot(rolling, color='blue')
ax.axhline(0.1, color='green', linestyle='--', label='Expected (10%)')
ax.axvline(1500, color='red', linestyle='--', label='Drift Start')
ax.set_title('Prediction Rate Over Time')
ax.legend()

# Top-right: histogram of confidences with a marker at 0.5.
ax = axes[0, 1]
ax.hist(confidences, bins=50, color='purple', alpha=0.7)
ax.axvline(0.5, color='red', linestyle='--', label='Threshold')
ax.set_title('Confidence Distribution')
ax.legend()

# Bottom-left: overlaid histograms of the 'amount' feature, split at position 1500.
ax = axes[1, 0]
before = [p.features['amount'] for p in preds[:1500]]
after = [p.features['amount'] for p in preds[1500:]]
ax.hist(before, bins=40, alpha=0.5, label='Before Drift', color='blue')
ax.hist(after, bins=40, alpha=0.5, label='After Drift', color='red')
ax.axvline(100, color='green', linestyle='--', label='Reference')
ax.set_title('Feature: amount')
ax.legend()

# Bottom-right: 100-sample rolling mean of the confidences.
ax = axes[1, 1]
conf_rolling = pd.Series(confidences).rolling(100).mean()
ax.plot(conf_rolling, color='orange')
ax.axvline(1500, color='red', linestyle='--', label='Drift Start')
ax.set_title('Confidence Trend')
ax.legend()

plt.tight_layout()
# The title is added after tight_layout; y=1.02 places it just above the top of the figure.
plt.suptitle('ML Metrics Dashboard', y=1.02, fontsize=14, fontweight='bold')
plt.show()

## 4. Logging Infrastructure

In [None]:
class LogLevel(Enum):
    """Log severities, declared from least to most severe."""

    DEBUG = 'DEBUG'
    INFO = 'INFO'
    WARNING = 'WARNING'
    ERROR = 'ERROR'
    CRITICAL = 'CRITICAL'


@dataclass
class LogEntry:
    """A single structured log record."""

    timestamp: datetime
    level: LogLevel
    component: str
    message: str
    metadata: Dict[str, Any] = field(default_factory=dict)


class MLLogger:
    """In-memory structured logger for one component.

    Records are appended to ``entries``. Anything below ``min_level``
    (``INFO`` by default) is discarded. Each level has a convenience method
    (``debug``, ``info``, ``warning``, ``error``, ``critical``) that accepts a
    ``metadata={...}`` dict and/or loose keyword fields.
    """

    def __init__(self, component: str):
        self.component = component
        self.entries: List[LogEntry] = []
        # Numeric rank per level so severities can be compared against min_level.
        self._levels = {LogLevel.DEBUG: 0, LogLevel.INFO: 1, LogLevel.WARNING: 2, LogLevel.ERROR: 3, LogLevel.CRITICAL: 4}
        self.min_level = LogLevel.INFO

    def _log(self, level, message, metadata=None):
        """Append an entry if `level` is at or above `min_level`."""
        if self._levels[level] < self._levels[self.min_level]:
            return
        # Store a copy: if the caller later mutates their dict, the recorded
        # entry must not change with it.
        snapshot = dict(metadata) if metadata else {}
        self.entries.append(LogEntry(datetime.now(), level, self.component, message, snapshot))

    def _emit(self, level, msg, kw):
        """Build the metadata for one call and hand it to `_log`.

        `kw` may contain a ``metadata`` dict plus arbitrary extra keywords.
        Extra keywords are merged on top of ``metadata`` instead of being
        silently discarded.
        """
        extra_fields = dict(kw)
        merged = dict(extra_fields.pop('metadata', None) or {})
        merged.update(extra_fields)
        self._log(level, msg, merged)

    def debug(self, msg, **kw):
        """Log `msg` at DEBUG level."""
        self._emit(LogLevel.DEBUG, msg, kw)

    def info(self, msg, **kw):
        """Log `msg` at INFO level."""
        self._emit(LogLevel.INFO, msg, kw)

    def warning(self, msg, **kw):
        """Log `msg` at WARNING level."""
        self._emit(LogLevel.WARNING, msg, kw)

    def error(self, msg, **kw):
        """Log `msg` at ERROR level."""
        self._emit(LogLevel.ERROR, msg, kw)

    def critical(self, msg, **kw):
        """Log `msg` at CRITICAL level."""
        self._emit(LogLevel.CRITICAL, msg, kw)


logger = MLLogger('inference')
logger.info('Model loaded', metadata={'size': 125})
logger.warning('High latency', metadata={'latency': 450})
logger.error('Feature missing', metadata={'feature': 'embedding'})

print('Log Entries:')
for e in logger.entries:
    print(f'[{e.level.value}] {e.message} - {e.metadata}')

## 5. Alerting Systems

In [None]:
class AlertSeverity(Enum):
    """Severity attached to an alert rule and to the alerts it raises."""

    INFO = 'info'
    WARNING = 'warning'
    CRITICAL = 'critical'


@dataclass
class AlertRule:
    """Threshold rule evaluated against one named metric.

    ``condition`` names the comparison between the metric value and
    ``threshold``: 'gt', 'lt', 'ge' or 'le'.
    """

    name: str
    metric: str
    condition: str
    threshold: float
    severity: AlertSeverity


@dataclass
class Alert:
    """One firing of a rule, with the metric value that triggered it."""

    timestamp: datetime
    rule_name: str
    value: float
    severity: AlertSeverity


class AlertingSystem:
    """Evaluates threshold rules against metric snapshots and keeps alert history.

    Args:
        cooldown_seconds: Minimum time between two alerts from the same rule
            (matched by rule name). The default of 0 disables the cooldown,
            so a rule fires on every check in which its condition holds.
    """

    # Supported comparisons, called as compare(value, threshold).
    _COMPARATORS = {
        'gt': lambda value, threshold: value > threshold,
        'lt': lambda value, threshold: value < threshold,
        'ge': lambda value, threshold: value >= threshold,
        'le': lambda value, threshold: value <= threshold,
    }

    def __init__(self, cooldown_seconds: float = 0):
        self.rules: List[AlertRule] = []
        self.alerts: List[Alert] = []
        self.cooldown = timedelta(seconds=cooldown_seconds)
        # Rule name -> time the rule last fired; consulted only when a cooldown is set.
        self._last_fired: Dict[str, datetime] = {}

    def add_rule(self, rule: AlertRule):
        """Register `rule`.

        Raises:
            ValueError: if ``rule.condition`` is not a supported comparison.
                Rejecting it here prevents a mistyped rule from being accepted
                and then never firing.
        """
        if rule.condition not in self._COMPARATORS:
            supported = ', '.join(sorted(self._COMPARATORS))
            raise ValueError(
                f'Unknown condition {rule.condition!r} for rule {rule.name!r}; expected one of: {supported}'
            )
        self.rules.append(rule)

    def check(self, metrics: Dict[str, float]) -> List[Alert]:
        """Evaluate every rule against `metrics`.

        Rules whose metric is absent from `metrics` are skipped. Alerts that
        fire are appended to ``self.alerts`` and also returned.

        Returns:
            The alerts raised by this call, in rule order.
        """
        new_alerts = []
        for rule in self.rules:
            if rule.metric not in metrics:
                continue
            # A rule appended directly to `self.rules` may bypass add_rule's
            # validation; such a rule never fires rather than raising here.
            compare = self._COMPARATORS.get(rule.condition)
            if compare is None:
                continue
            value = metrics[rule.metric]
            if not compare(value, rule.threshold):
                continue
            fired_at = datetime.now()
            previous = self._last_fired.get(rule.name)
            if previous is not None and fired_at - previous < self.cooldown:
                continue  # still inside this rule's cooldown window
            self._last_fired[rule.name] = fired_at
            alert = Alert(fired_at, rule.name, value, rule.severity)
            self.alerts.append(alert)
            new_alerts.append(alert)
        return new_alerts


alerting = AlertingSystem()
alerting.add_rule(AlertRule('High Latency', 'latency_p99', 'gt', 500, AlertSeverity.WARNING))
alerting.add_rule(AlertRule('High Error Rate', 'error_rate', 'gt', 0.05, AlertSeverity.CRITICAL))
alerting.add_rule(AlertRule('Prediction Drift', 'positive_rate', 'gt', 0.15, AlertSeverity.WARNING))
print(f'Configured {len(alerting.rules)} alert rules')

In [None]:
# Test alerting: feed five synthetic metric snapshots through the rules.
for i in range(5):
    is_anomaly = random.random() < 0.4
    # Pick the offsets added to each baseline depending on whether this snapshot is anomalous.
    if is_anomaly:
        latency_bump, error_bump, positive_bump = 400, 0.08, 0.1
    else:
        latency_bump, error_bump, positive_bump = 50, 0.01, 0.02
    metrics = {
        'latency_p99': 300 + latency_bump,
        'error_rate': 0.02 + error_bump,
        'positive_rate': 0.1 + positive_bump,
    }
    alerts = alerting.check(metrics)
    if not alerts:
        continue
    print(f'Iteration {i+1}: {len(alerts)} alerts')
    for a in alerts:
        print(f'  [{a.severity.value.upper()}] {a.rule_name}: {a.value:.2f}')

print(f'\nTotal alerts: {len(alerting.alerts)}')

## 6. Monitoring Dashboard

In [None]:
class MonitoringDashboard:
    """Combines operational stats, ML stats and alert history into one text report."""

    def __init__(self, ops_collector, ml_collector, alerting):
        self.ops = ops_collector
        self.ml = ml_collector
        self.alerting = alerting

    def report(self) -> str:
        """Render the current state as a multi-line string.

        Reads ``compute_stats()`` from the operational collector,
        ``get_stats()`` from the ML collector and ``alerts`` from the alerting
        system. Any statistic missing from those dicts is shown as 0.
        """
        ops_stats = self.ops.compute_stats()
        ml_stats = self.ml.get_stats()

        p50 = ops_stats.get("latency_p50", 0)
        p99 = ops_stats.get("latency_p99", 0)
        error_pct = ops_stats.get("error_rate", 0) * 100
        positive_pct = ml_stats.get("positive_rate", 0) * 100
        confidence = ml_stats.get("conf_mean", 0)
        alert_count = len(self.alerting.alerts)

        divider = '=' * 60
        header = [divider, 'ML SYSTEM MONITORING REPORT', divider]
        body = [
            # Leading newline leaves a blank line between the banner and the figures.
            f'\nOperational: P50={p50:.0f}ms, P99={p99:.0f}ms, Error={error_pct:.1f}%',
            f'ML: Positive={positive_pct:.1f}%, Confidence={confidence:.2f}',
            f'Alerts: {alert_count} total',
        ]
        return '\n'.join(header + body)

# Wire the dashboard to the collectors and alerting system built in the earlier cells.
dashboard = MonitoringDashboard(
    ops_collector=collector,
    ml_collector=ml_collector,
    alerting=alerting,
)
print(dashboard.report())

## 7. Hands-on Exercise

**Task**: Build a monitoring system for a recommendation engine.

1. Create collectors for latency, error rate, and click-through rate
2. Set up alerts for degraded CTR and high latency
3. Simulate 1000 requests with occasional degradation
4. Generate a monitoring report

In [None]:
# Exercise: Implement your recommendation engine monitoring here

class RecommendationMonitor:
    """Tracks latency and click-through rate (CTR) for a recommendation service.

    Args:
        window: Number of most recent requests used for each rolling CTR
            sample appended to ``ctr_history``. Defaults to 100.

    Raises:
        ValueError: if `window` is smaller than 1.
    """

    def __init__(self, window: int = 100):
        if window < 1:
            raise ValueError('window must be at least 1')
        self.window = window
        self.requests = []
        self.ctr_history = []

    @staticmethod
    def _ctr(requests) -> float:
        """Return clicks / impressions over `requests`, or 0.0 when nothing was shown."""
        impressions = sum(r['shown'] for r in requests)
        if impressions == 0:
            # No impressions means CTR is undefined; report 0.0 instead of
            # raising ZeroDivisionError in the monitoring path.
            return 0.0
        return sum(r['clicked'] for r in requests) / impressions

    def record_request(self, latency_ms, shown, clicked):
        """Store one request and, once a full window exists, append its rolling CTR.

        Args:
            latency_ms: Request latency.
            shown: Number of items shown for this request.
            clicked: Number of those items that were clicked.
        """
        self.requests.append({'latency': latency_ms, 'shown': shown, 'clicked': clicked})
        if len(self.requests) >= self.window:
            self.ctr_history.append(self._ctr(self.requests[-self.window:]))

    def get_metrics(self):
        """Return latency P50/P99 and overall CTR across every recorded request.

        Returns:
            An empty dict when no request has been recorded.
        """
        if not self.requests:
            return {}
        latencies = [r['latency'] for r in self.requests]
        return {
            'latency_p50': np.percentile(latencies, 50),
            'latency_p99': np.percentile(latencies, 99),
            'ctr': self._ctr(self.requests),
        }

# Simulate 1000 requests; every request after index 700 is degraded (slower, lower CTR).
rec_monitor = RecommendationMonitor()
for i in range(1000):
    degraded = i > 700
    if degraded:
        latency_scale, base_ctr = 100, 0.05
    else:
        latency_scale, base_ctr = 30, 0.15

    # The order of random draws below is kept fixed so seeded runs stay reproducible.
    latency = np.random.lognormal(np.log(latency_scale), 0.5)
    shown = random.randint(5, 10)
    # One independent click draw per item shown.
    clicked = 0
    for _ in range(shown):
        if random.random() < base_ctr:
            clicked += 1
    rec_monitor.record_request(latency, shown, clicked)

metrics = rec_monitor.get_metrics()
print('Recommendation Engine Metrics:')
print(f'Latency P50: {metrics["latency_p50"]:.1f}ms')
print(f'Latency P99: {metrics["latency_p99"]:.1f}ms')
print(f'CTR: {metrics["ctr"]*100:.2f}%')

## 8. Summary and Key Takeaways

### Key Concepts

1. **ML System Failures**: Data drift, concept drift, model staleness, infrastructure issues
2. **Operational Metrics**: Latency percentiles (P50, P95, P99), error rate, throughput, resource utilization
3. **ML Metrics**: Prediction distribution, feature statistics, confidence scores
4. **Logging**: Structured logs with component, level, metadata for debugging
5. **Alerting**: Rules-based alerts with severity levels and cooldowns

### Best Practices

- Monitor both operational AND ML-specific metrics
- Set up alerts BEFORE deploying to production
- Use percentiles (not averages) for latency monitoring
- Log predictions for debugging and retraining
- Establish baselines from training data for drift detection

### Next Steps

In the next tutorial, we will dive deeper into **Data Drift and Model Degradation**, covering statistical tests for drift detection, retraining strategies, and automated monitoring pipelines.