In [None]:
# Error Handling & Recovery - Cookbook Example 04

This notebook demonstrates comprehensive error handling strategies for WebSearcher agents, including retry mechanisms, fallback strategies, and recovery patterns.

## 🎯 What You'll Learn

- Different types of errors and their causes
- Retry mechanisms with exponential backoff
- Fallback strategies for degraded service
- Circuit breaker patterns for stability
- Error logging and monitoring
- Recovery strategies for failed operations
- Graceful degradation techniques

## 🛡️ Error Handling Benefits

1. **Reliability**: Robust systems that handle failures gracefully
2. **Resilience**: Automatic recovery from temporary issues
3. **Monitoring**: Comprehensive error tracking and alerting
4. **User Experience**: Smooth operation despite underlying failures
5. **Production Ready**: Enterprise-grade reliability patterns

Let's build bulletproof error handling for production use! 🚀


In [None]:
# Setup for error handling demonstration
import sys
import os
sys.path.insert(0, os.path.abspath('../../'))

# Initialize prompt system
import apps.research_prioritization.prompts.prompt_registry
from agents import WebSearcher

# Imports for error handling
import time
import random
import logging
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Callable
from datetime import datetime
from enum import Enum

# Root logging setup: INFO and above, each record prefixed with
# timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('error_handling_demo')

# Configuration dict handed to the searchers constructed further down.
CLIENT_CONFIG = {"reasoning": {"effort": "medium"}, "max_output_tokens": 3000}

# One print call with a newline separator emits the same three banner lines
# as three separate print calls would.
print(
    "🛡️ Error Handling & Recovery System Ready!",
    f"💻 Client Configuration: {CLIENT_CONFIG}",
    "📊 Logging configured for error tracking",
    sep="\n",
)


In [None]:
# Error handling framework with retry and circuit breaker patterns

class ErrorType(Enum):
    """Classification of different error types.

    Members are assigned by ``ResilientWebSearcher.classify_error`` from
    keywords found in the lower-cased exception message; each value is the
    short label used in log lines and in error summaries.
    """
    NETWORK_ERROR = "network"        # message mentions "network" or "connection"
    API_RATE_LIMIT = "rate_limit"    # message mentions "rate limit" or "429"
    VALIDATION_ERROR = "validation"  # message mentions "validation" or "pydantic"
    TIMEOUT_ERROR = "timeout"        # message mentions "timeout" or "timed out"
    UNKNOWN_ERROR = "unknown"        # fallback when no keyword matches

@dataclass
class ErrorEvent:
    """Detailed error event for tracking and analysis.

    One instance is appended to ``ResilientWebSearcher.error_log`` for every
    failure that the searcher records.
    """
    # ISO-8601 string; produced with datetime.now().isoformat() by log_error
    timestamp: str
    # Category assigned to the failure
    error_type: ErrorType
    # Where the failure was recorded: the searcher's prompt alias, or
    # "circuit_breaker" when a call was rejected by an open breaker
    component: str
    # Disease the failed search was about
    disease_name: str
    # Text of the failure (str() of the exception, or the breaker message)
    error_message: str
    # Zero-based index of the attempt that failed
    retry_attempt: int = 0
    
class CircuitBreaker:
    """Circuit breaker pattern for service protection.

    The breaker moves between three states:

    * ``"CLOSED"``    - calls are allowed (initial state).
    * ``"OPEN"``      - calls are rejected until ``timeout`` seconds have
      elapsed since the most recent recorded failure.
    * ``"HALF_OPEN"`` - the cooldown has elapsed and calls are allowed again;
      a recorded success closes the breaker, a recorded failure re-opens it
      (the failure count is not reset on the way to ``HALF_OPEN``).

    Args:
        failure_threshold: Number of recorded failures (since the last
            success) at which the breaker opens.
        timeout: Cooldown in seconds before an open breaker lets calls
            through again.
    """
    def __init__(self, failure_threshold: int = 3, timeout: int = 60):
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failure_count = 0
        # Monotonic-clock reading of the most recent failure (None until one occurs).
        self.last_failure_time: Optional[float] = None
        self.state = "CLOSED"  # CLOSED, OPEN, HALF_OPEN

    def is_open(self) -> bool:
        """Return True while calls must be rejected.

        As a side effect, an ``OPEN`` breaker whose cooldown has elapsed is
        switched to ``HALF_OPEN`` and the method returns False.
        """
        if self.state != "OPEN":
            return False
        # time.monotonic() cannot jump when the system clock is adjusted, so
        # the cooldown is neither cut short nor stretched by clock changes.
        if time.monotonic() - self.last_failure_time > self.timeout:
            self.state = "HALF_OPEN"
            return False
        return True

    def record_success(self):
        """Reset the failure count and close the breaker."""
        self.failure_count = 0
        self.state = "CLOSED"

    def record_failure(self):
        """Count a failure and open the breaker once the threshold is reached."""
        self.failure_count += 1
        self.last_failure_time = time.monotonic()
        if self.failure_count >= self.failure_threshold:
            self.state = "OPEN"
            logger.warning(f"Circuit breaker OPENED after {self.failure_count} failures")

class ResilientWebSearcher:
    """WebSearcher with comprehensive error handling and retry logic.

    Wraps a ``WebSearcher`` and adds:

    * keyword-based classification of exceptions into ``ErrorType``,
    * per-type retry budgets with exponential backoff and jitter,
    * a ``CircuitBreaker`` that rejects calls after repeated failed searches,
    * an in-memory log of ``ErrorEvent`` records with a summary view.

    Args:
        prompt_alias: Prompt alias forwarded to ``WebSearcher``; also used as
            the ``component`` of logged error events.
        client_config: Client configuration forwarded to ``WebSearcher``.
    """

    # Maximum number of retries per error type (types not listed get 0).
    MAX_RETRIES = {
        ErrorType.NETWORK_ERROR: 3,
        ErrorType.API_RATE_LIMIT: 2,
        ErrorType.TIMEOUT_ERROR: 2,
        ErrorType.VALIDATION_ERROR: 0,  # Don't retry validation errors
        ErrorType.UNKNOWN_ERROR: 1,
    }

    # Backoff base delay in seconds per error type.
    BASE_DELAYS = {
        ErrorType.NETWORK_ERROR: 2.0,
        ErrorType.API_RATE_LIMIT: 5.0,
        ErrorType.TIMEOUT_ERROR: 3.0,
        ErrorType.UNKNOWN_ERROR: 1.0,
    }

    # Base delay for error types missing from BASE_DELAYS.
    DEFAULT_BASE_DELAY = 1.0

    # Hard cap on attempts per search, regardless of the per-type budgets.
    MAX_TOTAL_ATTEMPTS = 5

    # Error messages longer than this are shortened in the summary.
    MESSAGE_PREVIEW_LENGTH = 50

    def __init__(self, prompt_alias: str, client_config: dict):
        self.prompt_alias = prompt_alias
        self.client_config = client_config
        self.searcher = WebSearcher(prompt_alias, client_config)
        self.circuit_breaker = CircuitBreaker()
        self.error_log: List[ErrorEvent] = []

    def classify_error(self, error: Exception) -> ErrorType:
        """Classify error type for appropriate handling.

        Matching is done on the lower-cased ``str(error)`` and the first
        matching rule wins, in this order: rate limit, timeout, network,
        validation. Anything else is ``ErrorType.UNKNOWN_ERROR``.
        """
        error_str = str(error).lower()

        if "rate limit" in error_str or "429" in error_str:
            return ErrorType.API_RATE_LIMIT
        elif "timeout" in error_str or "timed out" in error_str:
            return ErrorType.TIMEOUT_ERROR
        elif "network" in error_str or "connection" in error_str:
            return ErrorType.NETWORK_ERROR
        elif "validation" in error_str or "pydantic" in error_str:
            return ErrorType.VALIDATION_ERROR
        else:
            return ErrorType.UNKNOWN_ERROR

    def should_retry(self, error_type: ErrorType, attempt: int) -> bool:
        """Determine if error should trigger a retry.

        Args:
            error_type: Classification of the failure.
            attempt: Zero-based index of the attempt that just failed.

        Returns:
            True while ``attempt`` is below the retry budget of ``error_type``.
        """
        return attempt < self.MAX_RETRIES.get(error_type, 0)

    def get_retry_delay(self, error_type: ErrorType, attempt: int) -> float:
        """Calculate retry delay with exponential backoff.

        Returns ``base * 2**attempt`` seconds plus a random jitter in
        ``[0, 1]`` seconds, where ``base`` depends on ``error_type``.
        """
        base_delay = self.BASE_DELAYS.get(error_type, self.DEFAULT_BASE_DELAY)
        return base_delay * (2 ** attempt) + random.uniform(0, 1)  # Add jitter

    def search_with_resilience(self, template_kwargs: dict,
                             disease_name: Optional[str] = None) -> Optional[Any]:
        """Execute search with full error handling and retry logic.

        Args:
            template_kwargs: Keyword data passed unchanged to
                ``self.searcher.search``.
            disease_name: Label used in logs and error events. Defaults to
                ``template_kwargs['disease_name']`` or ``'Unknown'``.

        Returns:
            Whatever ``self.searcher.search`` returns on success, or None when
            the circuit breaker rejects the call or all attempts fail.
        """
        if disease_name is None:
            disease_name = template_kwargs.get('disease_name', 'Unknown')

        # Check circuit breaker
        if self.circuit_breaker.is_open():
            error_msg = "Circuit breaker is OPEN - service temporarily unavailable"
            logger.error(error_msg)
            self.log_error(ErrorType.UNKNOWN_ERROR, "circuit_breaker",
                          disease_name, error_msg, 0)
            return None

        attempt = 0

        while attempt < self.MAX_TOTAL_ATTEMPTS:
            try:
                logger.info(f"Attempting search for {disease_name} (attempt {attempt + 1})")
                result = self.searcher.search(template_kwargs)
            except Exception as e:
                error_type = self.classify_error(e)
                self.log_error(error_type, self.prompt_alias, disease_name, str(e), attempt)

                logger.warning(f"❌ Error on attempt {attempt + 1} for {disease_name}: {error_type.value}")

                # Retry only if both the per-type budget and the overall
                # attempt cap leave room for another attempt; this avoids a
                # pointless backoff sleep before giving up.
                attempts_remain = attempt + 1 < self.MAX_TOTAL_ATTEMPTS
                if attempts_remain and self.should_retry(error_type, attempt):
                    delay = self.get_retry_delay(error_type, attempt)
                    logger.info(f"⏳ Retrying in {delay:.1f}s...")
                    time.sleep(delay)
                    attempt += 1
                    continue

                # Every way of giving up counts as one failure for the breaker.
                logger.error(f"🛑 No more retries for {disease_name} - {error_type.value}")
                self.circuit_breaker.record_failure()
                return None

            # Success! Reset circuit breaker
            self.circuit_breaker.record_success()
            if attempt > 0:
                logger.info(f"✅ Successful recovery for {disease_name} after {attempt} retries")
            return result

        return None

    def log_error(self, error_type: ErrorType, component: str,
                  disease_name: str, error_message: str, retry_attempt: int):
        """Log detailed error information.

        Appends an ``ErrorEvent`` stamped with the current local time to
        ``self.error_log``.
        """
        error_event = ErrorEvent(
            timestamp=datetime.now().isoformat(),
            error_type=error_type,
            component=component,
            disease_name=disease_name,
            error_message=error_message,
            retry_attempt=retry_attempt
        )
        self.error_log.append(error_event)

    def get_error_summary(self) -> Dict[str, Any]:
        """Get comprehensive error summary.

        Returns:
            A dict that always contains the same four keys, including when no
            errors were logged:

            * ``total_errors``: number of logged events,
            * ``error_types``: count of events per ``ErrorType`` value,
            * ``circuit_breaker_state``: current breaker state string,
            * ``recent_errors``: up to the last five events, each with
              ``timestamp``, ``disease``, ``type`` and a ``message`` shortened
              to 50 characters plus ``"..."`` when longer.
        """
        error_types: Dict[str, int] = {}
        for event in self.error_log:
            label = event.error_type.value
            error_types[label] = error_types.get(label, 0) + 1

        limit = self.MESSAGE_PREVIEW_LENGTH
        recent_errors = []
        for event in self.error_log[-5:]:  # Last 5 errors
            message = event.error_message
            if len(message) > limit:
                message = message[:limit] + "..."
            recent_errors.append({
                "timestamp": event.timestamp,
                "disease": event.disease_name,
                "type": event.error_type.value,
                "message": message,
            })

        return {
            "total_errors": len(self.error_log),
            "error_types": error_types,
            "circuit_breaker_state": self.circuit_breaker.state,
            "recent_errors": recent_errors
        }

# Build one resilient searcher per prompt alias, in the same order as before
# (socioeconomic first, then groups), sharing the same client configuration.
resilient_socio, resilient_groups = (
    ResilientWebSearcher(alias, CLIENT_CONFIG)
    for alias in ("socioeconomic_v2", "groups_v1")
)

# Banner: both lines written by a single print call.
print(
    "🛡️ Resilient WebSearcher framework initialized!",
    "✅ Features: Retry logic, Circuit breaker, Error classification, Detailed logging",
    sep="\n",
)


In [None]:
## 🧪 Testing Error Resilience

Let's test the error handling capabilities with real scenarios and edge cases.


In [None]:
# Test resilient search with real disease analysis
template_data = {
    "orphacode": "905",
    "disease_name": "Wilson disease"
}

print("🧪 TESTING RESILIENT SEARCH")
print("=" * 40)

# Test socioeconomic analysis with error handling
print("📊 Testing socioeconomic analysis...")
socio_result = resilient_socio.search_with_resilience(template_data, template_data["disease_name"])

# search_with_resilience signals failure with None; compare against None
# explicitly so a result object that happens to be falsy still counts as success.
if socio_result is not None:
    print(f"✅ Socioeconomic analysis successful!")
    print(f"   Score: {socio_result.score}")
    print(f"   Evidence Level: {socio_result.evidence_level}")
else:
    print("❌ Socioeconomic analysis failed after all retries")

print(f"\n👥 Testing groups analysis...")
groups_result = resilient_groups.search_with_resilience(template_data, template_data["disease_name"])

if groups_result is not None:
    group_count = len(groups_result.groups) if groups_result.groups else 0
    print(f"✅ Groups analysis successful!")
    print(f"   Groups found: {group_count}")
else:
    print("❌ Groups analysis failed after all retries")

# Show error summaries
print(f"\n📋 ERROR SUMMARIES")
print("=" * 20)

socio_errors = resilient_socio.get_error_summary()
groups_errors = resilient_groups.get_error_summary()

# The summary of an error-free run may not carry 'circuit_breaker_state', so
# fall back to the breaker object instead of indexing the key directly (which
# would raise KeyError exactly when everything succeeded).
print(f"📊 Socioeconomic Searcher:")
print(f"   Total errors: {socio_errors['total_errors']}")
print(f"   Circuit breaker: {socio_errors.get('circuit_breaker_state', resilient_socio.circuit_breaker.state)}")

print(f"\n👥 Groups Searcher:")
print(f"   Total errors: {groups_errors['total_errors']}")
print(f"   Circuit breaker: {groups_errors.get('circuit_breaker_state', resilient_groups.circuit_breaker.state)}")

if socio_errors['recent_errors'] or groups_errors['recent_errors']:
    print(f"\n⚠️  Recent errors detected - see detailed analysis below")


In [None]:
# Setup for error handling demonstration
import sys
import os
sys.path.insert(0, os.path.abspath('../../'))

# Initialize prompt system
import apps.research_prioritization.prompts.prompt_registry
from agents import WebSearcher

# Imports for error handling
import time
import random
import logging
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Callable
from datetime import datetime
from enum import Enum

# NOTE(review): this block, together with the imports above it, repeats
# statement for statement the setup code in the notebook's first code cell.
# Re-running it rebinds `logger` and `CLIENT_CONFIG` to equal values and
# prints the banner again; consider removing the whole cell.

# Configure logging for error tracking
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so presumably this second call has no effect
# after the first setup cell has run - confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# getLogger returns the same logger object for the same name on every call.
logger = logging.getLogger('error_handling_demo')

# Client configuration
CLIENT_CONFIG = {
    "reasoning": {"effort": "medium"},
    "max_output_tokens": 3000
}

print("🛡️ Error Handling & Recovery System Ready!")
print(f"💻 Client Configuration: {CLIENT_CONFIG}")
print(f"📊 Logging configured for error tracking")
