In [None]:
# Monitoring & Debugging - Cookbook Example 07

This notebook demonstrates comprehensive monitoring, debugging, and observability techniques for WebSearcher agents in production environments.

## 🎯 What You'll Learn

- Real-time monitoring and alerting
- Performance metrics and dashboards
- Debug logging and tracing
- Health checks and system status
- Error analysis and troubleshooting
- Production deployment patterns
- System observability best practices

## 📊 Monitoring Benefits

1. **Visibility**: Complete insight into system behavior
2. **Reliability**: Early detection of issues and degradation
3. **Performance**: Optimization based on real usage patterns
4. **Debugging**: Rapid troubleshooting and root cause analysis
5. **Business Intelligence**: Usage analytics and cost optimization

Let's build production-ready monitoring for research systems! 🚀


In [None]:
# Setup for monitoring and debugging
import sys
import os
sys.path.insert(0, os.path.abspath('../../'))

# Initialize prompt system
import apps.research_prioritization.prompts.prompt_registry
from agents import WebSearcher

# Monitoring and debugging imports
import time
import json
import logging
import traceback
import psutil
import threading
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Any, Optional, Callable
from datetime import datetime, timedelta
from collections import defaultdict, deque
from enum import Enum

# Enhanced logging configuration
# The root logger is set to DEBUG and given two handlers: a stream handler
# and a file handler writing to 'websearcher_debug.log'.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('websearcher_debug.log'),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
    level=logging.DEBUG,
)

# Specialized loggers (one per concern, so records can be told apart by name)
monitor_logger, performance_logger, error_logger = (
    logging.getLogger(logger_name)
    for logger_name in ('monitor', 'performance', 'error')
)

# Configuration passed as the client config of every WebSearcher created below
MONITORING_CONFIG = {
    "reasoning": {
        "effort": "medium",
    },
    "max_output_tokens": 3000,
}

# Startup banner: one line per message, same order as before.
print(
    "🔍 Monitoring & Debugging System Ready!",
    f"💻 Configuration: {MONITORING_CONFIG}",
    "📊 Enhanced logging configured with file output",
    "🚀 Ready for production monitoring",
    sep="\n",
)


In [None]:
# Comprehensive monitoring and observability framework

class MetricType(Enum):
    """Kind of measurement carried by a ``Metric``.

    The member values are the lowercase strings that appear in the debug log
    and in the ``"type"`` field of the dashboard data.
    """

    COUNTER = "counter"      # used below for the requests_*_total metrics
    GAUGE = "gauge"          # used below for system_* readings and result_score
    HISTOGRAM = "histogram"  # declared but not recorded anywhere in this notebook
    TIMER = "timer"          # used below for request_duration_seconds

class AlertLevel(Enum):
    """Severity attached to an ``Alert``, listed from least to most severe.

    The member values are the lowercase strings emitted in the dashboard data;
    the upper-cased value is used as the tag in alert log lines.
    """

    INFO = "info"            # logged through the 'monitor' logger
    WARNING = "warning"      # logged through the 'monitor' logger
    ERROR = "error"          # logged through the 'error' logger
    CRITICAL = "critical"    # logged through the 'error' logger

@dataclass
class Metric:
    """A single recorded observation of a named metric.

    Only ``name``, ``value`` and ``metric_type`` are required.  ``timestamp``
    defaults to ``datetime.now()`` at construction time and ``labels`` to a
    fresh empty dict, so instances never share a labels mapping by default.
    """

    # Identifier the value is stored under (e.g. "requests_total").
    name: str
    # The observed number.
    value: float
    # Which kind of measurement this is.
    metric_type: MetricType
    # When the observation was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Free-form string tags describing the observation.
    labels: Dict[str, str] = field(default_factory=dict)

@dataclass
class Alert:
    """A notable event raised by a component or by an alert rule.

    ``timestamp`` defaults to ``datetime.now()`` at construction time and
    ``context`` to a fresh empty dict.
    """

    # Severity of the event.
    level: AlertLevel
    # Name of the component (or rule) the alert is about.
    component: str
    # Human-readable description.
    message: str
    # When the alert was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Arbitrary extra data to help diagnose the event.
    context: Dict[str, Any] = field(default_factory=dict)

@dataclass
class HealthCheck:
    """Outcome of the most recent health probe for one component.

    ``timestamp`` defaults to ``datetime.now()`` at construction time and
    ``details`` to a fresh empty dict.
    """

    # Name of the component that was checked.
    component: str
    # One of "healthy", "degraded" or "unhealthy" (plain string, not validated here).
    status: str
    # Response time of the checked operation; formatted with an "s" suffix where it is logged.
    response_time: float
    # When the check result was created.
    timestamp: datetime = field(default_factory=datetime.now)
    # Arbitrary extra data about the check.
    details: Dict[str, Any] = field(default_factory=dict)

class SystemMonitor:
    """In-memory collector for metrics, alerts and component health checks.

    Metrics are kept per name in bounded deques (10000 entries each), alerts in
    a bounded deque (1000 entries) and the latest health check per component in
    a dict.  An optional daemon thread samples CPU, memory and disk usage via
    ``psutil``.  Because that thread records metrics while the notebook reads
    them, the shared containers are only touched while holding ``self._lock``.
    """

    def __init__(self, retention_hours: int = 24):
        """Create an empty monitor.

        Args:
            retention_hours: Metrics older than this many hours are dropped
                whenever a new value is recorded under the same name.
        """
        self.retention_hours = retention_hours
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=10000))
        self.alerts: deque = deque(maxlen=1000)
        self.health_checks: Dict[str, HealthCheck] = {}
        self.alert_rules: List[Callable] = []
        self.start_time = datetime.now()

        # System resource tracking
        self.system_metrics_thread: Optional[threading.Thread] = None
        self.monitoring_active = False

        # Re-entrant so that a method holding the lock can call another
        # locking method (record_metric -> _cleanup_old_metrics).
        self._lock = threading.RLock()
        # Set by stop_system_monitoring() to wake the collector thread at once.
        self._stop_event = threading.Event()

    def record_metric(self, name: str, value: float, metric_type: MetricType,
                      labels: Optional[Dict[str, str]] = None):
        """Record a metric data point.

        The point is appended to the queue for ``name``, expired points of that
        name are pruned, every registered alert rule is evaluated against the
        new point, and a debug line is logged.

        Args:
            name: Metric name the value is stored under.
            value: Observed value.
            metric_type: Kind of measurement.
            labels: Optional string tags; an empty dict is used when omitted.
        """
        metric = Metric(
            name=name,
            value=value,
            metric_type=metric_type,
            labels=labels or {}
        )
        with self._lock:
            self.metrics[name].append(metric)
            # Clean old metrics
            self._cleanup_old_metrics(name)

        # Check alert rules (outside the lock so a slow rule does not block
        # other threads that want to record or read metrics)
        self._check_alert_rules(metric)

        monitor_logger.debug(f"Recorded metric: {name}={value} ({metric_type.value})")

    def record_alert(self, level: AlertLevel, component: str, message: str,
                     context: Optional[Dict[str, Any]] = None):
        """Record a system alert and log it at the level matching ``level``.

        Args:
            level: Severity of the alert.
            component: Name of the component the alert is about.
            message: Human-readable description.
            context: Optional extra data; an empty dict is used when omitted.
        """
        alert = Alert(
            level=level,
            component=component,
            message=message,
            context=context or {}
        )
        self._store_alert(alert)

    def _store_alert(self, alert: Alert) -> None:
        """Append ``alert`` to the alert history and write it to the log.

        Shared by ``record_alert`` and ``_check_alert_rules`` so that alerts
        produced by rules are logged exactly like explicitly recorded ones.
        """
        with self._lock:
            self.alerts.append(alert)

        # Log alert: INFO/WARNING go to the monitor logger, ERROR/CRITICAL to
        # the error logger.
        log_func = {
            AlertLevel.INFO: monitor_logger.info,
            AlertLevel.WARNING: monitor_logger.warning,
            AlertLevel.ERROR: error_logger.error,
            AlertLevel.CRITICAL: error_logger.critical
        }[alert.level]

        log_func(f"ALERT [{alert.level.value.upper()}] {alert.component}: {alert.message}")

    def update_health_check(self, component: str, status: str, response_time: float,
                            details: Optional[Dict[str, Any]] = None):
        """Update component health status.

        Stores the result as the latest check for ``component`` and logs it.
        A status of ``"unhealthy"`` additionally records an ERROR alert and
        ``"degraded"`` a WARNING alert; any other status records no alert.

        Args:
            component: Name of the checked component.
            status: Status string, e.g. "healthy", "degraded" or "unhealthy".
            response_time: Response time of the checked operation.
            details: Optional extra data; an empty dict is used when omitted.
        """
        # Normalise once so the stored check and the alert context agree.
        details = details or {}
        health_check = HealthCheck(
            component=component,
            status=status,
            response_time=response_time,
            details=details
        )
        with self._lock:
            self.health_checks[component] = health_check

        monitor_logger.info(f"Health check: {component} = {status} ({response_time:.3f}s)")

        # Generate alerts for unhealthy components
        if status == "unhealthy":
            alert_level = AlertLevel.ERROR
        elif status == "degraded":
            alert_level = AlertLevel.WARNING
        else:
            return

        self.record_alert(
            alert_level,
            component,
            f"Component {component} is {status}",
            {"response_time": response_time, "details": details}
        )

    def add_alert_rule(self, rule: Callable[[Metric], Optional[Alert]]):
        """Add custom alert rule.

        Args:
            rule: Callable invoked with every newly recorded ``Metric``; it
                returns an ``Alert`` to raise or ``None``.
        """
        self.alert_rules.append(rule)

    def get_metrics_since(self, name: str, since: datetime) -> List[Metric]:
        """Return a snapshot of the metrics named ``name`` recorded at or after ``since``.

        The list is built while holding the lock, so callers (for example
        alert rules) can iterate it safely.  An unknown name yields an empty
        list and does not create an entry in ``self.metrics``.
        """
        with self._lock:
            return [m for m in self.metrics.get(name, ()) if m.timestamp >= since]

    def start_system_monitoring(self):
        """Start background system resource monitoring.

        Does nothing when monitoring is already active.
        """
        if self.monitoring_active:
            return

        self._stop_event.clear()
        self.monitoring_active = True
        self.system_metrics_thread = threading.Thread(target=self._collect_system_metrics, daemon=True)
        self.system_metrics_thread.start()
        monitor_logger.info("Started system resource monitoring")

    def stop_system_monitoring(self):
        """Stop background monitoring.

        Signals the collector thread and waits up to five seconds for it to
        finish.
        """
        self.monitoring_active = False
        # Wake the collector immediately instead of letting it finish a
        # 10-30 second pause, which would outlast the join timeout below.
        self._stop_event.set()
        worker = self.system_metrics_thread
        if worker:
            worker.join(timeout=5)
            if worker.is_alive():
                monitor_logger.warning("System metrics thread did not stop within 5 seconds")
        monitor_logger.info("Stopped system resource monitoring")

    def _collect_system_metrics(self):
        """Background thread to collect system metrics.

        Loops until monitoring is stopped, recording CPU, memory and disk
        gauges, then pausing 10 seconds (30 seconds after an error).
        """
        while self.monitoring_active and not self._stop_event.is_set():
            try:
                # CPU usage (this call itself blocks for the 1 second interval)
                cpu_percent = psutil.cpu_percent(interval=1)
                self.record_metric("system_cpu_percent", cpu_percent, MetricType.GAUGE)

                # Memory usage
                memory = psutil.virtual_memory()
                self.record_metric("system_memory_percent", memory.percent, MetricType.GAUGE)
                self.record_metric("system_memory_available_mb", memory.available / 1024 / 1024, MetricType.GAUGE)

                # Disk usage
                disk = psutil.disk_usage('/')
                self.record_metric("system_disk_percent", (disk.used / disk.total) * 100, MetricType.GAUGE)

                pause_seconds = 10  # Collect every 10 seconds

            except Exception as e:
                error_logger.error(f"Error collecting system metrics: {e}")
                pause_seconds = 30  # Wait longer on error

            # Interruptible pause: returns early once stop is requested.
            self._stop_event.wait(pause_seconds)

    def _cleanup_old_metrics(self, metric_name: str):
        """Remove metrics older than retention period.

        Pops from the left of the queue while its oldest entry is older than
        ``retention_hours``; this relies on entries having been appended in
        timestamp order.
        """
        cutoff_time = datetime.now() - timedelta(hours=self.retention_hours)
        with self._lock:
            metrics_queue = self.metrics[metric_name]

            while metrics_queue and metrics_queue[0].timestamp < cutoff_time:
                metrics_queue.popleft()

    def _check_alert_rules(self, metric: Metric):
        """Check metric against alert rules.

        A rule that raises is logged and skipped so it cannot prevent the
        remaining rules from running.
        """
        for rule in list(self.alert_rules):
            try:
                alert = rule(metric)
                if alert:
                    self._store_alert(alert)
            except Exception as e:
                error_logger.error(f"Alert rule failed: {e}")

    def get_dashboard_data(self) -> Dict[str, Any]:
        """Get comprehensive dashboard data.

        Returns:
            A dict with the keys ``timestamp``, ``uptime_seconds``, ``metrics``
            (latest value per metric name), ``alerts`` (the last 10 alerts),
            ``health_checks``, ``total_metrics_collected`` and
            ``total_alerts``.  Timestamps are ISO-8601 strings and enum
            members are replaced by their values.
        """
        now = datetime.now()
        uptime = now - self.start_time

        # Copy everything needed while holding the lock; the collector thread
        # may add metric names or alerts at any moment.
        with self._lock:
            latest_by_name = {
                name: metrics_queue[-1]
                for name, metrics_queue in self.metrics.items()
                if metrics_queue
            }
            total_metrics = sum(len(q) for q in self.metrics.values())
            alerts_snapshot = list(self.alerts)
            health_snapshot = dict(self.health_checks)

        # Recent metrics summary
        recent_metrics = {
            name: {
                "value": latest.value,
                "timestamp": latest.timestamp.isoformat(),
                "type": latest.metric_type.value
            }
            for name, latest in latest_by_name.items()
        }

        # Recent alerts
        recent_alerts = [
            {
                "level": alert.level.value,
                "component": alert.component,
                "message": alert.message,
                "timestamp": alert.timestamp.isoformat()
            }
            for alert in alerts_snapshot[-10:]  # Last 10 alerts
        ]

        # Health status summary
        health_summary = {
            component: {
                "status": check.status,
                "response_time": check.response_time,
                "timestamp": check.timestamp.isoformat()
            }
            for component, check in health_snapshot.items()
        }

        return {
            "timestamp": now.isoformat(),
            "uptime_seconds": uptime.total_seconds(),
            "metrics": recent_metrics,
            "alerts": recent_alerts,
            "health_checks": health_summary,
            "total_metrics_collected": total_metrics,
            "total_alerts": len(alerts_snapshot)
        }


class MonitoredWebSearcher:
    """WebSearcher with comprehensive monitoring integration.

    Wraps a ``WebSearcher`` and reports request counts, durations, result
    scores, alerts and health checks to a ``SystemMonitor``.
    """

    def __init__(self, prompt_alias: str, client_config: dict, monitor: SystemMonitor):
        """Create the wrapped searcher.

        Args:
            prompt_alias: Passed to ``WebSearcher``; also used as the component
                name and metric label for this searcher.
            client_config: Passed to ``WebSearcher`` unchanged.
            monitor: Monitor that receives all metrics, alerts and health checks.
        """
        self.prompt_alias = prompt_alias
        self.client_config = client_config
        self.searcher = WebSearcher(prompt_alias, client_config)
        self.monitor = monitor
        self.request_count = 0

    def search_with_monitoring(self, template_kwargs: dict) -> Any:
        """Execute search with comprehensive monitoring.

        Args:
            template_kwargs: Passed to ``WebSearcher.search``.  Its optional
                ``'disease_name'`` entry is used for labels and log messages
                ('Unknown' when absent).

        Returns:
            Whatever ``WebSearcher.search`` returns.

        Raises:
            Exception: Any exception raised inside the monitored section is
                recorded (failure counter, duration, ERROR alert, "unhealthy"
                health check) and then re-raised unchanged.
        """
        request_id = f"{self.prompt_alias}_{self.request_count}"
        self.request_count += 1

        start_time = time.time()
        disease_name = template_kwargs.get('disease_name', 'Unknown')

        # Record request start
        self.monitor.record_metric(
            "requests_total",
            1,
            MetricType.COUNTER,
            {"prompt_alias": self.prompt_alias, "disease": disease_name}
        )

        monitor_logger.info(f"Starting search request {request_id} for {disease_name}")

        try:
            # Execute search
            result = self.searcher.search(template_kwargs)

            # Record success metrics
            response_time = time.time() - start_time
            self.monitor.record_metric(
                "request_duration_seconds",
                response_time,
                MetricType.TIMER,
                {"prompt_alias": self.prompt_alias, "status": "success"}
            )

            self.monitor.record_metric(
                "requests_successful_total",
                1,
                MetricType.COUNTER,
                {"prompt_alias": self.prompt_alias}
            )

            # Record result-specific metrics.  A missing, None or non-numeric
            # score must not turn a successful search into a failure, so the
            # conversion is guarded and the metric is simply skipped.
            score = getattr(result, 'score', None)
            if score is not None:
                try:
                    score_value = float(score)
                except (TypeError, ValueError):
                    monitor_logger.warning(
                        f"Request {request_id}: non-numeric score {score!r} not recorded"
                    )
                else:
                    self.monitor.record_metric(
                        "result_score",
                        score_value,
                        MetricType.GAUGE,
                        {"prompt_alias": self.prompt_alias, "disease": disease_name}
                    )

            performance_logger.info(
                f"Request {request_id} completed successfully in {response_time:.3f}s"
            )

            # Update health check: responses of 5 seconds or more count as degraded
            self.monitor.update_health_check(
                self.prompt_alias,
                "healthy" if response_time < 5.0 else "degraded",
                response_time,
                {"last_request_id": request_id, "result_type": type(result).__name__}
            )

            return result

        except Exception as e:
            # Record error metrics
            response_time = time.time() - start_time

            self.monitor.record_metric(
                "requests_failed_total",
                1,
                MetricType.COUNTER,
                {"prompt_alias": self.prompt_alias, "error_type": type(e).__name__}
            )

            self.monitor.record_metric(
                "request_duration_seconds",
                response_time,
                MetricType.TIMER,
                {"prompt_alias": self.prompt_alias, "status": "error"}
            )

            # Record alert
            self.monitor.record_alert(
                AlertLevel.ERROR,
                self.prompt_alias,
                f"Search request failed: {str(e)}",
                {
                    "request_id": request_id,
                    "disease_name": disease_name,
                    "error_type": type(e).__name__,
                    "traceback": traceback.format_exc()
                }
            )

            # Update health check
            self.monitor.update_health_check(
                self.prompt_alias,
                "unhealthy",
                response_time,
                {"last_error": str(e), "request_id": request_id}
            )

            error_logger.error(
                f"Request {request_id} failed after {response_time:.3f}s: {e}",
                exc_info=True
            )

            raise  # Re-raise the exception


# Stop a monitor left over from an earlier run of this cell so its background
# thread does not keep collecting into an orphaned object.
_previous_monitor = globals().get("system_monitor")
if _previous_monitor is not None:
    _previous_monitor.stop_system_monitoring()

# Initialize monitoring system
system_monitor = SystemMonitor(retention_hours=24)

# Thresholds used by the alert rules below
ERROR_RATE_WINDOW = timedelta(minutes=5)   # look-back period for counting failures
ERROR_RATE_THRESHOLD = 5                   # alert when more than this many failures
SLOW_RESPONSE_SECONDS = 10.0               # alert when a request takes longer than this


# Add custom alert rules
def high_error_rate_rule(metric: Metric) -> Optional[Alert]:
    """Alert on high error rates.

    Each failure is recorded as a separate ``requests_failed_total`` point with
    value 1, so a single point never exceeds the threshold on its own.  The
    rule therefore sums the failure values recorded by ``system_monitor``
    during ``ERROR_RATE_WINDOW`` up to the triggering metric.
    """
    if metric.name != "requests_failed_total":
        return None

    recent_failures = system_monitor.get_metrics_since(
        metric.name, metric.timestamp - ERROR_RATE_WINDOW
    )
    failure_count = sum(m.value for m in recent_failures)
    if failure_count > ERROR_RATE_THRESHOLD:
        return Alert(
            level=AlertLevel.WARNING,
            component="error_rate",
            message=f"High error rate detected: {failure_count} failures",
            context={
                "metric": asdict(metric),
                "failures": failure_count,
                "window_seconds": ERROR_RATE_WINDOW.total_seconds()
            }
        )
    return None


def slow_response_rule(metric: Metric) -> Optional[Alert]:
    """Alert on slow responses (request duration above ``SLOW_RESPONSE_SECONDS``)."""
    if metric.name == "request_duration_seconds" and metric.value > SLOW_RESPONSE_SECONDS:
        return Alert(
            level=AlertLevel.WARNING,
            component="performance",
            message=f"Slow response detected: {metric.value:.2f}s",
            context={"metric": asdict(metric)}
        )
    return None


system_monitor.add_alert_rule(high_error_rate_rule)
system_monitor.add_alert_rule(slow_response_rule)

# Initialize monitored searchers
monitored_socio = MonitoredWebSearcher("socioeconomic_v2", MONITORING_CONFIG, system_monitor)
monitored_groups = MonitoredWebSearcher("groups_v1", MONITORING_CONFIG, system_monitor)

# Start system monitoring
system_monitor.start_system_monitoring()

print("🔍 Comprehensive monitoring system initialized!")
print("✅ Features: Metrics collection, Alerting, Health checks, Performance tracking")
print("📊 System resource monitoring started")
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📊 Live Monitoring Dashboard\n",
    "\n",
    "Let's demonstrate the monitoring system with real requests and dashboard views."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute monitored requests to generate metrics\n",
    "print(\"🧪 EXECUTING MONITORED REQUESTS\")\n",
    "print(\"=\" * 35)\n",
    "\n",
    "test_diseases = [\n",
    "    (\"905\", \"Wilson disease\"),\n",
    "    (\"399\", \"Huntington disease\"),\n",
    "    (\"98\", \"Alpers syndrome\")\n",
    "]\n",
    "\n",
    "results = []\n",
    "\n",
    "for orphacode, disease_name in test_diseases:\n",
    "    template_data = {\"orphacode\": orphacode, \"disease_name\": disease_name}\n",
    "    \n",
    "    print(f\"\\n🔬 Analyzing {disease_name}...\")\n",
    "    \n",
    "    try:\n",
    "        # Socioeconomic analysis\n",
    "        socio_result = monitored_socio.search_with_monitoring(template_data)\n",
    "        print(f\"   📊 Socioeconomic: Score {socio_result.score if hasattr(socio_result, 'score') else 'N/A'}\")\n",
    "        \n",
    "        # Groups analysis\n",
    "        groups_result = monitored_groups.search_with_monitoring(template_data)\n",
    "        group_count = len(groups_result.groups) if hasattr(groups_result, 'groups') and groups_result.groups else 0\n",
    "        print(f\"   👥 Groups: {group_count} found\")\n",
    "        \n",
    "        results.append((disease_name, socio_result, groups_result))\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"   ❌ Error: {str(e)[:50]}...\")\n",
    "        results.append((disease_name, None, None))\n",
    "    \n",
    "    # Small delay to see metrics accumulate\n",
    "    time.sleep(1)\n",
    "\n",
    "print(f\"\\n✅ Completed {len(results)} disease analyses\")\n",
    "print(\"📊 Metrics and alerts have been collected\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display comprehensive monitoring dashboard\n",
    "dashboard_data = system_monitor.get_dashboard_data()\n",
    "\n",
    "print(\"📊 MONITORING DASHBOARD\")\n",
    "print(\"=\" * 25)\n",
    "\n",
    "# System overview\n",
    "uptime_hours = dashboard_data['uptime_seconds'] / 3600\n",
    "print(f\"⏱️  System Uptime: {uptime_hours:.2f} hours\")\n",
    "print(f\"📈 Total Metrics: {dashboard_data['total_metrics_collected']}\")\n",
    "print(f\"🚨 Total Alerts: {dashboard_data['total_alerts']}\")\n",
    "\n",
    "# Health status\n",
    "print(f\"\\n💚 COMPONENT HEALTH STATUS\")\n",
    "print(\"-\" * 30)\n",
    "if dashboard_data['health_checks']:\n",
    "    for component, health in dashboard_data['health_checks'].items():\n",
    "        status_emoji = {\n",
    "            \"healthy\": \"✅\",\n",
    "            \"degraded\": \"⚠️\",\n",
    "            \"unhealthy\": \"❌\"\n",
    "        }.get(health['status'], \"❓\")\n",
    "        \n",
    "        print(f\"{status_emoji} {component}: {health['status']} ({health['response_time']:.3f}s)\")\nelse:\n    print(\"No health checks available\")\n\n# Recent metrics\nprint(f\"\\n📊 RECENT METRICS\")\nprint(\"-\" * 20)\nif dashboard_data['metrics']:\n    for name, metric in dashboard_data['metrics'].items():\n        if 'system_' not in name:  # Focus on application metrics\n            print(f\"   {name}: {metric['value']} ({metric['type']})\")\nelse:\n    print(\"No recent metrics available\")\n\n# System resources\nprint(f\"\\n🖥️  SYSTEM RESOURCES\")\nprint(\"-\" * 20)\nfor name, metric in dashboard_data['metrics'].items():\n    if name.startswith('system_'):\n        metric_name = name.replace('system_', '').replace('_', ' ').title()\n        unit = \"%\" if \"percent\" in name else \"MB\" if \"mb\" in name else \"\"\n        print(f\"   {metric_name}: {metric['value']:.1f}{unit}\")\n\n# Recent alerts\nprint(f\"\\n🚨 RECENT ALERTS\")\nprint(\"-\" * 15)\nif dashboard_data['alerts']:\n    for alert in dashboard_data['alerts'][-5:]:  # Last 5 alerts\n        level_emoji = {\n            \"info\": \"ℹ️\",\n            \"warning\": \"⚠️\",\n            \"error\": \"❌\",\n            \"critical\": \"🔥\"\n        }.get(alert['level'], \"❓\")\n        \n        timestamp = datetime.fromisoformat(alert['timestamp']).strftime(\"%H:%M:%S\")\n        print(f\"   {level_emoji} [{timestamp}] {alert['component']}: {alert['message'][:60]}...\")\nelse:\n    print(\"   No recent alerts\")\n\n# Performance summary\nrequest_metrics = {\n    name: metric for name, metric in dashboard_data['metrics'].items() \n    if 'request' in name or 'duration' in name\n}\n\nif request_metrics:\n    print(f\"\\n⚡ PERFORMANCE SUMMARY\")\n    print(\"-\" * 22)\n    for name, metric in request_metrics.items():\n        print(f\"   {name}: {metric['value']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🔧 Debugging and Troubleshooting\n",
    "\n",
    "Let's demonstrate debugging capabilities with simulated issues."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Debugging utilities and troubleshooting tools\n",
    "\n",
    "def generate_debug_report(monitor: SystemMonitor) -> Dict[str, Any]:\n",
    "    \"\"\"Generate comprehensive debug report\"\"\"\n    \n",
    "    dashboard = monitor.get_dashboard_data()\n",
    "    \n",
    "    # Error analysis\n",
    "    error_alerts = [\n",
    "        alert for alert in monitor.alerts \n",
    "        if alert.level in [AlertLevel.ERROR, AlertLevel.CRITICAL]\n",
    "    ]\n",
    "    \n",
    "    # Performance analysis\n",
    "    slow_requests = []\n",
    "    for metric_name, metrics_queue in monitor.metrics.items():\n",
    "        if \"duration\" in metric_name:\n",
    "            for metric in metrics_queue:\n",
    "                if metric.value > 5.0:  # Requests slower than 5s\n",
    "                    slow_requests.append({\n",
    "                        \"metric\": metric_name,\n",
    "                        \"duration\": metric.value,\n",
    "                        \"timestamp\": metric.timestamp.isoformat(),\n",
    "                        \"labels\": metric.labels\n",
    "                    })\n",
    "    \n",
    "    # Component health analysis\n",
    "    unhealthy_components = {\n",
    "        comp: health for comp, health in monitor.health_checks.items()\n",
    "        if health.status in [\"degraded\", \"unhealthy\"]\n",
    "    }\n",
    "    \n",
    "    return {\n",
    "        \"generated_at\": datetime.now().isoformat(),\n",
    "        \"system_overview\": dashboard,\n",
    "        \"error_analysis\": {\n",
    "            \"total_errors\": len(error_alerts),\n",
    "            \"recent_errors\": [asdict(alert) for alert in error_alerts[-10:]]\n",
    "        },\n",
    "        \"performance_analysis\": {\n",
    "            \"slow_requests_count\": len(slow_requests),\n",
    "            \"slow_requests\": slow_requests[-5:]  # Last 5 slow requests\n",
    "        },\n",
    "        \"health_analysis\": {\n",
    "            \"unhealthy_components_count\": len(unhealthy_components),\n",
    "            \"unhealthy_components\": {\n",
    "                comp: asdict(health) for comp, health in unhealthy_components.items()\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\n",
    "def simulate_error_scenario():\n",
    "    \"\"\"Simulate an error scenario for debugging demonstration\"\"\"\n",
    "    print(\"🧪 SIMULATING ERROR SCENARIO\")\n",
    "    print(\"-\" * 30)\n",
    "    \n",
    "    # Simulate some failures\n",
    "    try:\n",
    "        # This will fail due to invalid orphacode\n",
    "        invalid_data = {\"orphacode\": \"INVALID\", \"disease_name\": \"Test Disease\"}\n",
    "        monitored_socio.search_with_monitoring(invalid_data)\n",
    "    except Exception as e:\n",
    "        print(f\"✅ Expected error captured: {type(e).__name__}\")\n",
    "    \n",
    "    # Record some manual alerts for demonstration\n",
    "    system_monitor.record_alert(\n",
    "        AlertLevel.WARNING,\n",
    "        \"test_component\",\n",
    "        \"Simulated degraded performance\",\n",
    "        {\"test_scenario\": True}\n",
    "    )\n",
    "    \n",
    "    system_monitor.record_alert(\n",
    "        AlertLevel.ERROR,\n",
    "        \"data_validation\",\n",
    "        \"Invalid input data detected\",\n",
    "        {\"invalid_orphacode\": \"INVALID\"}\n",
    "    )\n",
    "    \n",
    "    print(\"🚨 Error scenario simulation complete\")\n",
    "\n",
    "# Run error simulation\n",
    "simulate_error_scenario()\n",
    "\n",
    "# Generate debug report\n",
    "print(\"\\n🔍 GENERATING DEBUG REPORT\")\n",
    "print(\"=\" * 30)\n",
    "\n",
    "debug_report = generate_debug_report(system_monitor)\n",
    "\n",
    "print(f\"📊 Debug Report Generated at: {debug_report['generated_at']}\")\nprint(f\"\\n❌ ERROR ANALYSIS:\")\nprint(f\"   Total errors: {debug_report['error_analysis']['total_errors']}\")\nif debug_report['error_analysis']['recent_errors']:\n    print(f\"   Recent errors:\")\n    for error in debug_report['error_analysis']['recent_errors'][-3:]:\n        print(f\"     • [{error['level']}] {error['component']}: {error['message'][:50]}...\")\n\nprint(f\"\\n⚡ PERFORMANCE ANALYSIS:\")\nprint(f\"   Slow requests: {debug_report['performance_analysis']['slow_requests_count']}\")\nif debug_report['performance_analysis']['slow_requests']:\n    print(f\"   Recent slow requests:\")\n    for req in debug_report['performance_analysis']['slow_requests']:\n        print(f\"     • {req['metric']}: {req['duration']:.2f}s\")\n\nprint(f\"\\n💚 HEALTH ANALYSIS:\")\nprint(f\"   Unhealthy components: {debug_report['health_analysis']['unhealthy_components_count']}\")\nif debug_report['health_analysis']['unhealthy_components']:\n    for comp, health in debug_report['health_analysis']['unhealthy_components'].items():\n        print(f\"     • {comp}: {health['status']} ({health['response_time']:.3f}s)\")\n\n# Save debug report to file\nwith open('debug_report.json', 'w') as f:\n    json.dump(debug_report, f, indent=2, default=str)\nprint(f\"\\n💾 Debug report saved to 'debug_report.json'\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 📚 Key Learnings & Best Practices\n",
    "\n",
    "### ✅ What We Accomplished\n",
    "\n",
    "1. **Comprehensive Monitoring**: Metrics, alerts, and health checks\n",
    "2. **Real-time Observability**: Live dashboard with system insights\n",
    "3. **Error Tracking**: Detailed error analysis and troubleshooting\n",
    "4. **Performance Monitoring**: Response times and throughput tracking\n",
    "5. **System Health**: Resource usage and component status monitoring\n",
    "6. **Debug Reporting**: Automated troubleshooting and analysis tools\n",
    "\n",
    "### 🎯 Production Monitoring Essentials\n",
    "\n",
    "- **Four Golden Signals**: Latency, traffic, errors, and saturation\n",
    "- **Proactive Alerting**: Catch issues before they impact users\n",
    "- **Health Checks**: Regular component status verification\n",
    "- **Debug Tools**: Rapid troubleshooting capabilities\n",
    "- **Historical Data**: Trend analysis and capacity planning\n",
    "\n",
    "### 📊 Monitoring Stack Components\n",
    "\n",
    "- **Metrics Collection**: Custom metrics with labels and metadata\n",
    "- **Alert Management**: Severity-based alerting with context\n",
    "- **Health Monitoring**: Component status and response time tracking\n",
    "- **System Resources**: CPU, memory, and disk usage monitoring\n",
    "- **Performance Analytics**: Request duration and throughput analysis\n",
    "\n",
    "### 🚀 Next Steps for Production\n",
    "\n",
    "- **External Monitoring**: Integrate with Prometheus, Grafana, or DataDog\n",
    "- **Log Aggregation**: Centralized logging with ELK stack or similar\n",
    "- **Distributed Tracing**: Request tracing across microservices\n",
    "- **Automated Remediation**: Self-healing systems and auto-scaling\n",
    "- **Business Metrics**: Track research impact and cost optimization\n",
    "\n",
    "### 🛡️ Security and Compliance\n",
    "\n",
    "- **Audit Logging**: Complete request and response logging\n",
    "- **Privacy Protection**: Ensure no sensitive data in logs\n",
    "- **Access Control**: Secure monitoring dashboard access\n",
    "- **Compliance Reporting**: Automated compliance and audit reports\n",
    "\n",
    "The monitoring and debugging framework ensures production-ready observability! 🎊"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleanup - stop monitoring\n",
    "print(\"🧹 CLEANUP\")\n",
    "print(\"=\" * 10)\n",
    "\n",
    "system_monitor.stop_system_monitoring()\n",
    "print(\"✅ System monitoring stopped\")\n",
    "\n",
    "# Final dashboard snapshot\n",
    "final_dashboard = system_monitor.get_dashboard_data()\n",
    "print(f\"📊 Final metrics count: {final_dashboard['total_metrics_collected']}\")\n",
    "print(f\"🚨 Final alerts count: {final_dashboard['total_alerts']}\")\n",
    "print(f\"⏱️  Total uptime: {final_dashboard['uptime_seconds']:.1f} seconds\")\n",
    "\n",
    "print(\"\\n🎊 Monitoring demonstration complete!\")\n",
    "print(\"📖 Check 'websearcher_debug.log' and 'debug_report.json' for detailed logs\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
