# Day 35: Distributed Tracing for LLM Services - Part 3

Implementing distributed tracing to understand request flows and performance bottlenecks in LLM services.

## Overview
1. Setting up OpenTelemetry tracing
2. Instrumenting LLM operations
3. Testing traces end to end
4. Trace analysis and performance bottleneck insights
5. Production tracing configuration

In [None]:
# Install required packages
!pip install -q opentelemetry-api opentelemetry-sdk opentelemetry-instrumentation

In [None]:
import time
import uuid
import json
from typing import Dict, Any, Optional
from contextlib import contextmanager

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.trace import Status, StatusCode

## 1. Setting Up OpenTelemetry Tracing

Configure OpenTelemetry for distributed tracing.

In [None]:
def setup_tracing():
    """Configure OpenTelemetry tracing and return the tracer for this notebook.

    The first call installs an SDK ``TracerProvider`` whose spans are printed
    to the console through a ``BatchSpanProcessor``.  Later calls (for example
    when this cell is re-run) reuse the SDK provider that is already installed
    instead of attaching another processor to it, which would print every
    span once per call.

    Returns:
        The ``"llm-service"`` tracer (version ``"1.0.0"``) obtained from the
        global tracer provider.
    """
    # The global provider can only be set once per process, so a provider and
    # its console exporter are only built when no SDK provider is installed.
    if not isinstance(trace.get_tracer_provider(), TracerProvider):
        provider = TracerProvider()
        # Console exporter for the demo, wrapped in a batching span processor.
        provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
        trace.set_tracer_provider(provider)

    return trace.get_tracer("llm-service", "1.0.0")


# Initialize tracing
tracer = setup_tracing()
print("OpenTelemetry tracing configured")

## 2. Traced LLM Service

Create an LLM service with comprehensive tracing.

In [None]:
class TracedLLMService:
    """Mock LLM service whose request pipeline is instrumented with spans.

    Each call to :meth:`generate` opens a root ``llm_generate`` span and one
    child span per pipeline stage (validation, tokenization, model loading,
    inference with its attention and token-generation phases, and
    post-processing).  All model work is simulated with ``time.sleep``.
    """

    # Mock price charged per token (input and output alike).
    COST_PER_TOKEN = 0.001
    # Upper bound on the number of tokens the mock generator produces.
    MOCK_MAX_OUTPUT_TOKENS = 20

    def __init__(self, model_name="gpt-3.5-turbo", tracer=None):
        """Create the service.

        Args:
            model_name: Model identifier attached to span attributes.
            tracer: Tracer used to create spans.  When omitted, the
                ``"llm-service"`` tracer of the global tracer provider is
                used.
        """
        self.model_name = model_name
        if tracer is None:
            tracer = trace.get_tracer("llm-service", "1.0.0")
        self.tracer = tracer

    def generate(self, prompt: str, tenant_id: str, max_tokens: int = 100) -> Dict[str, Any]:
        """Run the full (simulated) generation pipeline inside a root span.

        Args:
            prompt: Input text; must contain non-whitespace characters.
            tenant_id: Tenant identifier recorded on the root span.
            max_tokens: Maximum number of tokens to generate; must be positive.

        Returns:
            A dict with the keys ``'text'``, ``'usage'`` (prompt, completion
            and total token counts) and ``'cost'``.

        Raises:
            ValueError: If the prompt is empty or ``max_tokens`` is not
                positive.  Any exception raised by a stage is recorded on the
                root span and re-raised.
        """
        with self.tracer.start_as_current_span(
            "llm_generate",
            attributes={
                "llm.model_name": self.model_name,
                "llm.tenant_id": tenant_id,
                "llm.max_tokens": max_tokens,
                "llm.input_length": len(prompt),
            },
            # Failures are recorded explicitly in the handler below; turning
            # off the automatic handling keeps each exception from being
            # recorded twice on this span.
            record_exception=False,
            set_status_on_exception=False,
        ) as span:
            try:
                # Each stage opens its own child span.
                self._validate_input(prompt, max_tokens)
                input_tokens = self._tokenize(prompt)
                self._load_model()
                output_tokens, generated_text = self._run_inference(prompt, max_tokens)
                result = self._post_process(input_tokens, output_tokens, generated_text)
            except Exception as e:
                span.record_exception(e)
                span.set_status(Status(StatusCode.ERROR, str(e)))
                raise
            else:
                # Summarize the request on the root span.
                span.set_attributes({
                    "llm.input_tokens": input_tokens,
                    "llm.output_tokens": output_tokens,
                    "llm.total_tokens": input_tokens + output_tokens,
                    "llm.cost": result['cost'],
                })
                span.set_status(Status(StatusCode.OK))
                return result

    def _validate_input(self, prompt: str, max_tokens: int) -> None:
        """Validate the request inside a ``validate_input`` span.

        Raises:
            ValueError: If ``prompt`` is empty/whitespace-only or
                ``max_tokens`` is not positive.
        """
        prompt_empty = not prompt.strip()

        with self.tracer.start_as_current_span(
            "validate_input",
            attributes={
                "validation.prompt_empty": prompt_empty,
                "validation.max_tokens_valid": max_tokens > 0,
            },
        ) as span:
            if prompt_empty:
                span.add_event("Validation failed: empty prompt")
                span.set_status(Status(StatusCode.ERROR, "Empty prompt"))
                raise ValueError("Prompt cannot be empty")

            if max_tokens <= 0:
                span.add_event("Validation failed: invalid max_tokens")
                span.set_status(Status(StatusCode.ERROR, "Invalid max_tokens"))
                raise ValueError("max_tokens must be positive")

            span.add_event("Input validation passed")
            span.set_status(Status(StatusCode.OK))

    def _tokenize(self, prompt: str) -> int:
        """Count whitespace-separated tokens inside a ``tokenize`` span.

        Returns:
            The number of whitespace-separated words in ``prompt``.
        """
        with self.tracer.start_as_current_span(
            "tokenize",
            attributes={"tokenizer.input_length": len(prompt)},
        ) as span:
            # Simulate tokenization latency
            time.sleep(0.01)
            tokens = len(prompt.split())

            span.set_attributes({
                "tokenizer.token_count": tokens,
                # Characters per token; 0 when there are no tokens.
                "tokenizer.compression_ratio": len(prompt) / tokens if tokens > 0 else 0,
            })

            span.add_event(f"Tokenized {tokens} tokens")
            return tokens

    def _load_model(self):
        """Simulate loading the model inside a ``load_model`` span."""
        with self.tracer.start_as_current_span(
            "load_model",
            attributes={"model.name": self.model_name},
        ) as span:
            # Simulate model loading latency
            time.sleep(0.05)

            span.add_event("Model loaded successfully")
            span.set_attributes({
                "model.loaded": True,
                "model.memory_usage": "4.2GB",  # Mock value
            })

    def _run_inference(self, prompt: str, max_tokens: int) -> tuple:
        """Run the attention and token-generation phases in an ``inference`` span.

        Returns:
            A ``(output_tokens, generated_text)`` tuple.
        """
        with self.tracer.start_as_current_span(
            "inference",
            attributes={
                "inference.max_tokens": max_tokens,
                "inference.model": self.model_name,
            },
        ) as span:
            # Time both phases so the throughput attribute reflects the
            # duration actually spent rather than a fixed constant.
            started = time.perf_counter()
            self._run_attention(prompt)
            output_tokens, generated_text = self._generate_tokens(max_tokens)
            elapsed = time.perf_counter() - started

            span.set_attributes({
                "inference.output_tokens": output_tokens,
                "inference.tokens_per_second": output_tokens / elapsed if elapsed > 0 else 0.0,
            })

            return output_tokens, generated_text

    def _run_attention(self, prompt: str):
        """Simulate the attention computation inside an ``attention`` span."""
        with self.tracer.start_as_current_span(
            "attention",
            attributes={"attention.sequence_length": len(prompt.split())},
        ) as span:
            # Simulate attention computation latency
            time.sleep(0.1)

            span.add_event("Attention computation complete")
            span.set_attributes({
                "attention.heads": 32,
                "attention.layers": 24,
            })

    def _generate_tokens(self, max_tokens: int) -> tuple:
        """Simulate token generation inside a ``token_generation`` span.

        Returns:
            A ``(output_tokens, generated_text)`` tuple, where
            ``output_tokens`` is ``max_tokens`` capped at
            ``MOCK_MAX_OUTPUT_TOKENS``.
        """
        with self.tracer.start_as_current_span(
            "token_generation",
            attributes={"generation.max_tokens": max_tokens},
        ) as span:
            # Simulate token generation latency
            time.sleep(max_tokens * 0.01)

            output_tokens = min(max_tokens, self.MOCK_MAX_OUTPUT_TOKENS)  # Mock output
            generated_text = f" Generated response with {output_tokens} tokens."

            # "length" only applies when the token limit was actually reached;
            # otherwise the mock generator stopped before the limit.
            completion_reason = "length" if output_tokens >= max_tokens else "stop"

            span.set_attributes({
                "generation.actual_tokens": output_tokens,
                "generation.completion_reason": completion_reason,
            })

            return output_tokens, generated_text

    def _post_process(self, input_tokens: int, output_tokens: int, text: str) -> Dict[str, Any]:
        """Assemble the response and its cost inside a ``post_process`` span.

        Returns:
            A dict with ``'text'``, ``'usage'`` (``'prompt_tokens'``,
            ``'completion_tokens'``, ``'total_tokens'``) and ``'cost'``.
        """
        with self.tracer.start_as_current_span(
            "post_process",
            attributes={
                "post_process.input_tokens": input_tokens,
                "post_process.output_tokens": output_tokens,
            },
        ) as span:
            # Calculate cost from the combined token count
            total_tokens = input_tokens + output_tokens
            cost = total_tokens * self.COST_PER_TOKEN

            span.set_attributes({
                "post_process.total_tokens": total_tokens,
                "post_process.cost": cost,
            })

            return {
                'text': text,
                'usage': {
                    'prompt_tokens': input_tokens,
                    'completion_tokens': output_tokens,
                    'total_tokens': total_tokens,
                },
                'cost': cost,
            }


# Initialize traced service
traced_service = TracedLLMService()
print("Traced LLM service initialized")

## 3. Testing Distributed Tracing

Generate requests to see traces in action.

In [None]:
def test_tracing():
    """Exercise the traced service with one successful and one failing request.

    Prints the outcome of each request, then flushes the tracer provider so
    the spans of both requests are exported before the function returns.
    """
    print("=== Testing Successful Request ===")
    try:
        result = traced_service.generate(
            prompt="What is artificial intelligence?",
            tenant_id="tenant_123",
            max_tokens=30
        )
    except Exception as e:
        print(f"Error: {e}")
    else:
        print(f"Success: {result['usage']['total_tokens']} tokens generated")

    print("\n" + "="*50 + "\n")

    print("=== Testing Error Case ===")
    try:
        traced_service.generate(
            prompt="",  # Empty prompt to trigger error
            tenant_id="tenant_456",
            max_tokens=20
        )
    except Exception as e:
        print(f"Expected error: {e}")
    else:
        # Make a missing validation failure visible instead of passing silently.
        print("Error: empty prompt was accepted but should have been rejected")

    # Spans are exported in batches by a background worker; flushing here makes
    # their console output appear with this cell rather than some time later.
    flush = getattr(trace.get_tracer_provider(), "force_flush", None)
    if callable(flush):
        flush()


# Run tracing test
test_tracing()

## 4. Trace Analysis

Create a simple trace analyzer to extract insights.

In [None]:
class TraceAnalyzer:
    """Collect mock span timings and print per-operation statistics."""

    def __init__(self):
        # Recorded entries in insertion order; each is a dict with the keys
        # 'operation', 'duration_ms', 'attributes' and 'timestamp'.
        self.traces = []

    def add_mock_trace(self, operation: str, duration_ms: float, attributes: dict):
        """Record one mock span for later analysis.

        Args:
            operation: Operation name used to group durations.
            duration_ms: Duration of the span in milliseconds.
            attributes: Arbitrary attributes stored alongside the span.
        """
        self.traces.append({
            'operation': operation,
            'duration_ms': duration_ms,
            'attributes': attributes,
            'timestamp': time.time()
        })

    def analyze_performance(self, threshold_ms: float = 100):
        """Print per-operation timing statistics and flag slow operations.

        For every operation the count, average, maximum and minimum duration
        are printed, followed by the sum of all recorded durations and the
        operations considered bottlenecks (slowest first).  Prints a notice
        and returns immediately when nothing has been recorded.

        Args:
            threshold_ms: An operation is reported as a bottleneck when its
                average duration is strictly greater than this value.
        """
        if not self.traces:
            print("No traces to analyze")
            return

        # Group durations by operation, keeping first-seen order.
        durations_by_op = {}
        for entry in self.traces:
            durations_by_op.setdefault(entry['operation'], []).append(entry['duration_ms'])

        print("=== Trace Performance Analysis ===")

        # Sum over every recorded entry; nested operations are not de-duplicated.
        total_time = 0
        bottlenecks = []
        for op, durations in durations_by_op.items():
            op_total = sum(durations)
            avg_duration = op_total / len(durations)

            print(f"\n{op}:")
            print(f"  Count: {len(durations)}")
            print(f"  Avg: {avg_duration:.2f}ms")
            print(f"  Max: {max(durations):.2f}ms")
            print(f"  Min: {min(durations):.2f}ms")

            total_time += op_total
            if avg_duration > threshold_ms:
                bottlenecks.append((op, avg_duration))

        print(f"\nTotal trace time: {total_time:.2f}ms")

        if bottlenecks:
            print("\nPerformance Bottlenecks:")
            for op, duration in sorted(bottlenecks, key=lambda x: x[1], reverse=True):
                print(f"  {op}: {duration:.2f}ms")
        else:
            print("\nNo significant bottlenecks detected")

# Mock span timings standing in for real trace data.
# Each entry is (operation name, duration in milliseconds, span attributes).
mock_traces = [
    ('llm_generate', 520.5, {'model': 'gpt-3.5-turbo', 'tokens': 45}),
    ('validate_input', 2.1, {'valid': True}),
    ('tokenize', 8.3, {'tokens': 15}),
    ('load_model', 45.2, {'model': 'gpt-3.5-turbo'}),
    ('inference', 380.8, {'tokens': 30}),
    ('attention', 120.4, {'layers': 24}),
    ('token_generation', 245.1, {'tokens': 30}),
    ('post_process', 12.6, {'cost': 0.045}),
]

# Feed every mock span into a fresh analyzer
analyzer = TraceAnalyzer()
for op, duration, attrs in mock_traces:
    analyzer.add_mock_trace(operation=op, duration_ms=duration, attributes=attrs)

# Print the per-operation statistics and any bottlenecks
analyzer.analyze_performance()

## 5. Production Tracing Configuration

Create a production-ready tracing configuration.

In [None]:
# Production tracing configuration
# The template is delimited with ''' because the generated module contains
# """docstrings""" of its own, which would otherwise end the literal early.
production_tracing_config = '''
# production_tracing.py
import functools
import os
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.resources import Resource

# Sampling configuration
TRACING_CONFIG = {
    'SAMPLING_RATE': float(os.getenv('TRACING_SAMPLING_RATE', '0.1')),  # 10% sampling
    'MAX_SPANS_PER_TRACE': int(os.getenv('MAX_SPANS_PER_TRACE', '100')),
    'TRACE_TIMEOUT': int(os.getenv('TRACE_TIMEOUT', '30')),  # seconds
}

def setup_production_tracing():
    """Setup production tracing with Jaeger."""
    
    # Create resource with service information
    resource = Resource.create({
        "service.name": os.getenv("SERVICE_NAME", "llm-service"),
        "service.version": os.getenv("SERVICE_VERSION", "1.0.0"),
        "deployment.environment": os.getenv("ENVIRONMENT", "production")
    })
    
    # Create tracer provider that samples new traces at the configured rate
    # and follows the parent's sampling decision for child spans
    provider = TracerProvider(
        resource=resource,
        sampler=ParentBased(TraceIdRatioBased(TRACING_CONFIG['SAMPLING_RATE'])),
    )
    
    # Configure Jaeger exporter
    jaeger_exporter = JaegerExporter(
        agent_host_name=os.getenv("JAEGER_AGENT_HOST", "localhost"),
        agent_port=int(os.getenv("JAEGER_AGENT_PORT", "6831")),
    )
    
    # Create span processor
    span_processor = BatchSpanProcessor(
        jaeger_exporter,
        max_queue_size=2048,
        max_export_batch_size=512,
        export_timeout_millis=30000,
    )
    
    # Add processor to tracer provider, then register it globally
    provider.add_span_processor(span_processor)
    trace.set_tracer_provider(provider)
    
    return trace.get_tracer(__name__)

# Custom span decorator
def trace_operation(operation_name: str, **attributes):
    """Decorator for tracing operations."""
    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            tracer = trace.get_tracer(__name__)
            # Exceptions are recorded explicitly below, so the automatic
            # recording is disabled to avoid duplicate exception events.
            with tracer.start_as_current_span(
                operation_name,
                attributes=attributes,
                record_exception=False,
                set_status_on_exception=False,
            ) as span:
                try:
                    result = func(*args, **kwargs)
                    span.set_status(trace.Status(trace.StatusCode.OK))
                    return result
                except Exception as e:
                    span.record_exception(e)
                    span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
                    raise
        return wrapper
    return decorator

# Usage example:
# @trace_operation("database_query", db_type="postgresql")
# def query_database(query):
#     # Database operation
#     pass
'''

# Write production config
with open("production_tracing.py", "w", encoding="utf-8") as f:
    f.write(production_tracing_config)

print("Production tracing configuration created: production_tracing.py")
print("\nKey Features:")
print("- Jaeger integration for trace collection")
print("- Configurable sampling rates")
print("- Resource attribution (service name, version)")
print("- Custom span decorator for easy instrumentation")
print("- Batch processing for performance")
print("- Environment-based configuration")

## Conclusion

Distributed tracing provides deep insights into LLM service performance:

1. **Request Flow Visibility**: See the complete journey of each request
2. **Performance Bottlenecks**: Identify slow operations and optimize them
3. **Error Context**: Understand where and why errors occur
4. **Dependency Analysis**: Track interactions between services

**Key Benefits for LLM Services**:
- Understand token generation performance
- Identify model loading bottlenecks
- Track attention computation time
- Analyze end-to-end request latency

**Best Practices**:
- Use appropriate sampling rates (1-10% in production)
- Include relevant business context in spans
- Set up proper span relationships (parent-child)
- Monitor trace collection performance impact

Next, we'll explore GPU monitoring for LLM services.