# Day 33: Autoscaling for LLM Services - Part 5

Autoscaling is crucial for cost-effective LLM deployment. This notebook covers strategies for automatically scaling LLM services based on demand.

## Overview
1. Autoscaling metrics and triggers
2. Simple autoscaling implementation
3. Autoscaling simulation
4. Visualizing autoscaling behavior
5. Production autoscaling script and considerations

In [None]:
import time
import threading
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from dataclasses import dataclass
from typing import List, Dict, Optional

## 1. Autoscaling Metrics

Key metrics for LLM autoscaling decisions.

In [None]:
@dataclass
class ScalingMetrics:
    """Snapshot of the signals an autoscaler inspects at one point in time."""
    # Number of requests currently waiting to be served
    queue_length: int
    # Mean response time of recently completed requests, in seconds
    avg_response_time: float
    # Recent request throughput
    requests_per_second: float
    # GPU utilization as a percentage
    gpu_utilization: float
    # Memory usage; presumably a percentage as well -- TODO confirm the unit
    memory_usage: float
    # Number of service instances running when the snapshot was taken
    active_instances: int

class MetricsCollector:
    """Collect request and queue statistics and turn them into ScalingMetrics.

    Response times and completion timestamps are stored in bounded deques, so
    only the 100 most recent requests contribute to the average response time
    and to the request rate.
    """

    def __init__(self, window_size=60):
        """Create an empty collector.

        Args:
            window_size: Maximum number of metric snapshots retained in
                ``metrics_history``.
        """
        self.window_size = window_size
        self.metrics_history = deque(maxlen=window_size)
        self.request_times = deque(maxlen=100)
        # Completion timestamps are kept apart from the durations: the request
        # rate depends on *when* requests finished, not on how long they took.
        self.request_timestamps = deque(maxlen=100)
        self.current_queue_length = 0
        self.active_instances = 1

    def record_request(self, response_time: float):
        """Record a completed request.

        Args:
            response_time: How long the request took, in seconds.
        """
        self.request_times.append(response_time)
        self.request_timestamps.append(time.time())

    def update_queue_length(self, length: int):
        """Update current queue length."""
        self.current_queue_length = length

    def _requests_per_second(self, window_seconds: float = 10.0) -> float:
        """Return the rate of requests completed in the last ``window_seconds``.

        Because at most 100 timestamps are retained, the reported rate is
        capped at ``100 / window_seconds``.
        """
        cutoff = time.time() - window_seconds
        recent_requests = sum(1 for stamp in self.request_timestamps if stamp > cutoff)
        return recent_requests / window_seconds

    def get_current_metrics(self) -> "ScalingMetrics":
        """Build a metrics snapshot, append it to the history and return it.

        Returns:
            A ``ScalingMetrics`` instance describing the current state. The
            GPU and memory figures are random placeholders.
        """
        # Calculate average response time (0.0 until a request is recorded)
        if self.request_times:
            avg_response_time = float(np.mean(list(self.request_times)))
        else:
            avg_response_time = 0.0

        # Calculate requests per second (last 10 seconds)
        requests_per_second = self._requests_per_second(10.0)

        # Mock GPU metrics (in production, use nvidia-ml-py)
        gpu_utilization = np.random.uniform(40, 95)  # Mock GPU usage
        memory_usage = np.random.uniform(60, 90)     # Mock memory usage

        metrics = ScalingMetrics(
            queue_length=self.current_queue_length,
            avg_response_time=avg_response_time,
            requests_per_second=requests_per_second,
            gpu_utilization=gpu_utilization,
            memory_usage=memory_usage,
            active_instances=self.active_instances
        )

        self.metrics_history.append(metrics)
        return metrics

# Create the module-level collector that the simulation code feeds and reads
metrics_collector = MetricsCollector()
print("Metrics collector initialized")

## 2. Simple Autoscaler Implementation

In [None]:
class SimpleAutoscaler:
    """Threshold-based autoscaler that moves one instance at a time.

    Scaling up happens when *any* scale-up threshold is exceeded; scaling down
    only when *all* scale-down thresholds are undercut. After construction and
    after every scaling action, no further action is taken until
    ``cooldown_period`` seconds have elapsed on the autoscaler's clock.
    """

    def __init__(self, min_instances=1, max_instances=10,
                 cooldown_period=300, clock=time.time):
        """Create an autoscaler that starts at ``min_instances``.

        Args:
            min_instances: Lower bound on the instance count.
            max_instances: Upper bound on the instance count.
            cooldown_period: Seconds that must pass between scaling actions
                (default 300, i.e. 5 minutes).
            clock: Zero-argument callable returning the current time in
                seconds. Defaults to ``time.time``; pass a custom callable to
                drive the autoscaler from simulated time.

        Raises:
            ValueError: If ``min_instances`` is greater than ``max_instances``.
        """
        if min_instances > max_instances:
            raise ValueError(
                f"min_instances ({min_instances}) must not exceed "
                f"max_instances ({max_instances})"
            )

        self.min_instances = min_instances
        self.max_instances = max_instances
        self.current_instances = min_instances
        self._clock = clock
        self.last_scale_time = clock()
        self.cooldown_period = cooldown_period

        # Scaling thresholds
        self.scale_up_thresholds = {
            'queue_length': 10,
            'avg_response_time': 5.0,
            'gpu_utilization': 85.0
        }

        self.scale_down_thresholds = {
            'queue_length': 2,
            'avg_response_time': 1.0,
            'gpu_utilization': 30.0
        }

    def should_scale_up(self, metrics: "ScalingMetrics") -> bool:
        """Return True if any scale-up threshold is exceeded and capacity remains."""
        conditions = [
            metrics.queue_length > self.scale_up_thresholds['queue_length'],
            metrics.avg_response_time > self.scale_up_thresholds['avg_response_time'],
            metrics.gpu_utilization > self.scale_up_thresholds['gpu_utilization']
        ]

        # Scale up if any condition is met
        return any(conditions) and self.current_instances < self.max_instances

    def should_scale_down(self, metrics: "ScalingMetrics") -> bool:
        """Return True if every scale-down threshold is undercut and we are above the minimum."""
        conditions = [
            metrics.queue_length < self.scale_down_thresholds['queue_length'],
            metrics.avg_response_time < self.scale_down_thresholds['avg_response_time'],
            metrics.gpu_utilization < self.scale_down_thresholds['gpu_utilization']
        ]

        # Scale down only if all conditions are met
        return all(conditions) and self.current_instances > self.min_instances

    def can_scale(self) -> bool:
        """Check if we're outside the cooldown period."""
        return self._clock() - self.last_scale_time > self.cooldown_period

    def scale_up(self) -> bool:
        """Scale up by one instance.

        Returns:
            True if an instance was added, False if already at the maximum.
        """
        if self.current_instances < self.max_instances:
            self.current_instances += 1
            self.last_scale_time = self._clock()
            print(f"🔼 Scaled UP to {self.current_instances} instances")
            return True
        return False

    def scale_down(self) -> bool:
        """Scale down by one instance.

        Returns:
            True if an instance was removed, False if already at the minimum.
        """
        if self.current_instances > self.min_instances:
            self.current_instances -= 1
            self.last_scale_time = self._clock()
            print(f"🔽 Scaled DOWN to {self.current_instances} instances")
            return True
        return False

    def evaluate(self, metrics: "ScalingMetrics") -> str:
        """Evaluate metrics and make scaling decision.

        Returns:
            One of ``"cooldown"``, ``"scaled_up"``, ``"scaled_down"`` or
            ``"no_action"``.
        """
        if not self.can_scale():
            return "cooldown"

        if self.should_scale_up(metrics):
            if self.scale_up():
                return "scaled_up"
        elif self.should_scale_down(metrics):
            if self.scale_down():
                return "scaled_down"

        return "no_action"

# Create the shared autoscaler, bounded to 1-5 instances, and report its range
autoscaler = SimpleAutoscaler(min_instances=1, max_instances=5)
print(f"Autoscaler initialized: {autoscaler.min_instances}-{autoscaler.max_instances} instances")

## 3. Autoscaling Simulation

In [None]:
def simulate_workload_pattern(duration_minutes=30):
    """Simulate a realistic workload pattern.

    Each loop iteration stands for one simulated minute. The queue length
    follows a low -> rising -> high -> falling profile with Gaussian noise,
    a few completed requests are recorded, and the module-level
    ``autoscaler`` is asked for a scaling decision based on the metrics
    produced by the module-level ``metrics_collector``.

    Args:
        duration_minutes: Number of simulated minutes to run.

    Returns:
        A ``(timeline, metrics_history, scaling_events)`` tuple: the list of
        minute indices, the metrics snapshot taken at each minute, and a list
        of ``(minute, action, instance_count)`` tuples for every scaling
        action that was carried out.
    """

    print(f"Simulating {duration_minutes} minutes of workload...")

    # Simulation data storage
    timeline = []
    metrics_history = []
    scaling_events = []

    # One iteration represents this many seconds of simulated time
    seconds_per_step = 60

    for minute in range(duration_minutes):
        # Simulate different load patterns
        if minute < 5:  # Low load
            base_load = 2
        elif minute < 15:  # Increasing load
            base_load = 2 + (minute - 5) * 2
        elif minute < 20:  # High load
            base_load = 20
        else:  # Decreasing load
            base_load = max(2, 20 - (minute - 20) * 3)

        # Add some randomness
        queue_length = max(0, int(base_load + np.random.normal(0, 2)))

        # Update metrics
        metrics_collector.update_queue_length(queue_length)
        metrics_collector.active_instances = autoscaler.current_instances

        # Simulate some completed requests
        for _ in range(np.random.randint(1, 5)):
            response_time = np.random.uniform(0.5, 3.0)
            metrics_collector.record_request(response_time)

        # Get current metrics
        current_metrics = metrics_collector.get_current_metrics()

        # Make scaling decision
        scaling_action = autoscaler.evaluate(current_metrics)

        # Record data
        timeline.append(minute)
        metrics_history.append(current_metrics)

        if scaling_action in ['scaled_up', 'scaled_down']:
            scaling_events.append((minute, scaling_action, autoscaler.current_instances))

        # Print status every 5 minutes
        if minute % 5 == 0:
            print(f"Minute {minute}: Queue={queue_length}, Instances={autoscaler.current_instances}, Action={scaling_action}")

        # Small delay for simulation
        time.sleep(0.1)

        # Advance simulated time. The autoscaler measures its cooldown against
        # last_scale_time, but an iteration only takes a fraction of a real
        # second; without this shift the cooldown would never expire during
        # the run and every evaluation would report "cooldown".
        autoscaler.last_scale_time -= seconds_per_step

    return timeline, metrics_history, scaling_events

# Run a 20-minute simulation and keep its outputs for the plotting code
timeline, metrics_history, scaling_events = simulate_workload_pattern(20)

# Report how many scaling actions happened, then list each one
print(f"\nSimulation completed. Scaling events: {len(scaling_events)}")
for minute, action, instances in scaling_events:
    print(f"  Minute {minute}: {action} -> {instances} instances")

## 4. Visualizing Autoscaling Behavior

In [None]:
# Extract one series per metric from the recorded snapshots
queue_lengths = [m.queue_length for m in metrics_history]
instance_counts = [m.active_instances for m in metrics_history]
gpu_utilizations = [m.gpu_utilization for m in metrics_history]
response_times = [m.avg_response_time for m in metrics_history]

# Read the thresholds from the autoscaler so the reference lines always match
# the values the scaling decisions were made with
gpu_up_threshold = autoscaler.scale_up_thresholds['gpu_utilization']
gpu_down_threshold = autoscaler.scale_down_thresholds['gpu_utilization']
latency_up_threshold = autoscaler.scale_up_thresholds['avg_response_time']
latency_down_threshold = autoscaler.scale_down_thresholds['avg_response_time']

# Create visualization: a 2x2 grid of panels
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Queue length and instances (two y-axes sharing the time axis)
queue_line, = ax1.plot(timeline, queue_lengths, 'b-', label='Queue Length', linewidth=2)
ax1_twin = ax1.twinx()
instance_line, = ax1_twin.plot(timeline, instance_counts, 'r-', label='Instances', linewidth=2, marker='o')
ax1.set_xlabel('Time (minutes)')
ax1.set_ylabel('Queue Length', color='b')
ax1_twin.set_ylabel('Active Instances', color='r')
ax1.set_title('Queue Length vs Active Instances')
# The two lines live on different axes, so build one legend from both handles
ax1.legend(handles=[queue_line, instance_line], loc='upper left')
ax1.grid(True, alpha=0.3)

# GPU utilization
ax2.plot(timeline, gpu_utilizations, 'g-', linewidth=2)
ax2.axhline(y=gpu_up_threshold, color='r', linestyle='--', alpha=0.7, label='Scale Up Threshold')
ax2.axhline(y=gpu_down_threshold, color='b', linestyle='--', alpha=0.7, label='Scale Down Threshold')
ax2.set_xlabel('Time (minutes)')
ax2.set_ylabel('GPU Utilization (%)')
ax2.set_title('GPU Utilization Over Time')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Response times
ax3.plot(timeline, response_times, 'm-', linewidth=2)
ax3.axhline(y=latency_up_threshold, color='r', linestyle='--', alpha=0.7, label='Scale Up Threshold')
ax3.axhline(y=latency_down_threshold, color='b', linestyle='--', alpha=0.7, label='Scale Down Threshold')
ax3.set_xlabel('Time (minutes)')
ax3.set_ylabel('Avg Response Time (s)')
ax3.set_title('Average Response Time')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Scaling events: mark each action with a vertical line and a small label
ax4.plot(timeline, instance_counts, 'r-', linewidth=2, marker='o')
for event in scaling_events:
    minute, action, instances = event
    color = 'green' if action == 'scaled_up' else 'orange'
    ax4.axvline(x=minute, color=color, linestyle=':', alpha=0.7)
    ax4.annotate(f'{action}\n({instances})', xy=(minute, instances), 
                xytext=(5, 5), textcoords='offset points', fontsize=8)

ax4.set_xlabel('Time (minutes)')
ax4.set_ylabel('Active Instances')
ax4.set_title('Scaling Events Timeline')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n=== Autoscaling Summary ===")
print(f"Total scaling events: {len(scaling_events)}")
print(f"Max instances used: {max(instance_counts)}")
print(f"Min instances used: {min(instance_counts)}")
print(f"Average queue length: {np.mean(queue_lengths):.1f}")
print(f"Average response time: {np.mean(response_times):.2f}s")
print(f"Average GPU utilization: {np.mean(gpu_utilizations):.1f}%")

## 5. Production Autoscaling Script

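The cell below writes a template for a production autoscaling script to `production_autoscaler.py`. The generated script polls a metrics endpoint with `requests` and scales a Kubernetes deployment through `kubectl`, so both must be available wherever it runs.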

In [None]:
# Production autoscaling script
#
# The template below is stored as a string and written to disk unchanged.
# It is delimited with triple SINGLE quotes because the script itself contains
# triple-double-quoted docstrings; using the same delimiter for both would end
# the literal at the first embedded docstring. The literal also starts directly
# with the shebang so that it is the first line of the generated file.
production_autoscaler = '''#!/usr/bin/env python3

# Production LLM Autoscaler
# This script monitors LLM service metrics and scales instances accordingly

import time
import logging
import requests
import subprocess
from dataclasses import dataclass
from typing import Dict, List

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class AutoscalerConfig:
    min_instances: int = 1
    max_instances: int = 10
    scale_up_threshold_queue: int = 10
    scale_down_threshold_queue: int = 2
    scale_up_threshold_latency: float = 5.0
    scale_down_threshold_latency: float = 1.0
    cooldown_period: int = 300  # 5 minutes
    check_interval: int = 60    # 1 minute
    metrics_endpoint: str = "http://localhost:8000/metrics"
    deployment_name: str = "llm-service"

class ProductionAutoscaler:
    def __init__(self, config: AutoscalerConfig):
        self.config = config
        # NOTE: the instance count is tracked locally and assumed to start at
        # the configured minimum; it is not read back from the cluster.
        self.current_instances = config.min_instances
        self.last_scale_time = 0
        
    def get_metrics(self) -> Dict:
        """Fetch metrics from the LLM service."""
        try:
            response = requests.get(self.config.metrics_endpoint, timeout=10)
            # Treat HTTP error statuses as a failed fetch instead of parsing
            # an error body as if it were metrics
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Failed to fetch metrics: {e}")
            return {}
    
    def _set_replicas(self, replicas: int) -> None:
        """Ask Kubernetes to run the given number of replicas."""
        # Example: Kubernetes scaling command (argument list, no shell involved)
        cmd = [
            "kubectl", "scale", "deployment", self.config.deployment_name,
            f"--replicas={replicas}",
        ]
        subprocess.run(cmd, check=True)
    
    def scale_up(self) -> bool:
        """Scale up the service."""
        if self.current_instances >= self.config.max_instances:
            logger.warning("Already at maximum instances")
            return False
        
        try:
            self._set_replicas(self.current_instances + 1)
        except Exception as e:
            logger.error(f"Failed to scale up: {e}")
            return False
        
        self.current_instances += 1
        self.last_scale_time = time.time()
        logger.info(f"Scaled UP to {self.current_instances} instances")
        return True
    
    def scale_down(self) -> bool:
        """Scale down the service."""
        if self.current_instances <= self.config.min_instances:
            logger.warning("Already at minimum instances")
            return False
        
        try:
            self._set_replicas(self.current_instances - 1)
        except Exception as e:
            logger.error(f"Failed to scale down: {e}")
            return False
        
        self.current_instances -= 1
        self.last_scale_time = time.time()
        logger.info(f"Scaled DOWN to {self.current_instances} instances")
        return True
    
    def should_scale(self, metrics: Dict) -> str:
        """Determine scaling action based on metrics."""
        if time.time() - self.last_scale_time < self.config.cooldown_period:
            return "cooldown"
        
        queue_length = metrics.get("queue_length")
        avg_latency = metrics.get("avg_response_time")
        
        # Scale up conditions (any metric that is present may trigger it)
        queue_high = (queue_length is not None
                      and queue_length > self.config.scale_up_threshold_queue)
        latency_high = (avg_latency is not None
                        and avg_latency > self.config.scale_up_threshold_latency)
        if queue_high or latency_high:
            return "scale_up"
        
        # A missing metric must not be read as "idle": never scale down on
        # incomplete data
        if queue_length is None or avg_latency is None:
            logger.warning("Metrics are incomplete, not scaling down")
            return "no_action"
        
        # Scale down conditions
        if (queue_length < self.config.scale_down_threshold_queue and 
            avg_latency < self.config.scale_down_threshold_latency):
            return "scale_down"
        
        return "no_action"
    
    def _run_once(self) -> None:
        """Fetch metrics, apply one scaling decision and log the status."""
        # Get current metrics
        metrics = self.get_metrics()
        
        if not metrics:
            logger.warning("No metrics available, skipping scaling decision")
            return
        
        # Make scaling decision
        action = self.should_scale(metrics)
        
        if action == "scale_up":
            self.scale_up()
        elif action == "scale_down":
            self.scale_down()
        else:
            logger.debug(f"No scaling action needed: {action}")
        
        # Log current status
        logger.info(f"Status: {self.current_instances} instances, "
                    f"Queue: {metrics.get('queue_length', 0)}, "
                    f"Latency: {metrics.get('avg_response_time', 0):.2f}s")
    
    def run(self):
        """Main autoscaler loop."""
        logger.info("Starting LLM autoscaler...")
        
        # The KeyboardInterrupt handler wraps the whole loop, including the
        # sleep, because that is where the process spends almost all its time
        try:
            while True:
                try:
                    self._run_once()
                except Exception as e:
                    logger.error(f"Error in autoscaler loop: {e}")
                
                time.sleep(self.config.check_interval)
        except KeyboardInterrupt:
            logger.info("Autoscaler stopped by user")

if __name__ == "__main__":
    config = AutoscalerConfig()
    autoscaler = ProductionAutoscaler(config)
    autoscaler.run()
'''

# Write production autoscaler
with open("production_autoscaler.py", "w", encoding="utf-8") as f:
    f.write(production_autoscaler)

print("Production autoscaler created: production_autoscaler.py")
print("\nTo run: python production_autoscaler.py")
print("\nProduction Features:")
print("- Configurable thresholds and parameters")
print("- Kubernetes integration for scaling")
print("- Comprehensive logging")
print("- Error handling and recovery")
print("- Cooldown periods to prevent thrashing")

## Conclusion

Effective autoscaling for LLM services requires:

1. **Appropriate Metrics**: Queue length, response time, GPU utilization
2. **Smart Thresholds**: Balance responsiveness with stability
3. **Cooldown Periods**: Prevent scaling thrashing
4. **Production Integration**: Kubernetes, monitoring, logging

Key considerations:
- LLM cold start times are significant
- GPU resources are expensive and granular
- Request processing times vary widely
- Cost optimization is crucial

This completes our exploration of LLM serving stacks and deployment strategies.