# Day 34: Rate Limiting Strategies - Part 2

Implementing rate limiting algorithms to control resource usage and ensure fair access in LLM serving systems.

## Overview
1. Token bucket algorithm
2. Sliding window rate limiter
3. Multi-dimensional rate limiting
4. Fair queuing implementation
5. Rate limiting performance analysis

In [None]:
import time
import threading
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict
from datetime import datetime, timedelta

## 1. Token Bucket Rate Limiter

In [None]:
class TokenBucket:
    """Thread-safe token bucket rate limiter.

    The bucket starts full, never holds more than ``capacity`` tokens, and
    gains ``refill_rate`` tokens per second of elapsed time. The refill is
    computed lazily each time the bucket is consumed from or inspected.
    """

    def __init__(self, capacity, refill_rate):
        """
        Token bucket rate limiter.

        Args:
            capacity: Maximum number of tokens
            refill_rate: Tokens added per second
        """
        self.capacity = capacity
        self.tokens = capacity
        self.refill_rate = refill_rate
        self.last_refill = time.time()
        self.lock = threading.Lock()

    def _refill(self):
        """Refill tokens based on elapsed time.

        Callers are expected to hold ``self.lock``.
        """
        now = time.time()
        # time.time() follows the system clock, which can be set backwards.
        # Clamp so a negative interval never removes tokens from the bucket.
        elapsed = max(0.0, now - self.last_refill)

        self.tokens = min(self.capacity, self.tokens + elapsed * self.refill_rate)
        # Always re-anchor, so a backwards clock step costs at most one interval.
        self.last_refill = now

    def consume(self, tokens=1):
        """Try to consume tokens.

        Args:
            tokens: Number of tokens to take; must not be negative.

        Returns:
            True if the bucket held enough tokens (they are deducted),
            False otherwise (the bucket is left untouched). A request larger
            than ``capacity`` can never succeed.

        Raises:
            ValueError: If ``tokens`` is negative, which would otherwise
                add tokens to the bucket.
        """
        if tokens < 0:
            raise ValueError("tokens must be non-negative")

        with self.lock:
            self._refill()

            if self.tokens < tokens:
                return False
            self.tokens -= tokens
            return True

    def get_tokens(self):
        """Get current token count (after applying any pending refill)."""
        with self.lock:
            self._refill()
            return self.tokens

# Token bucket demo: at most 10 tokens, refilled at 2 tokens per second
bucket = TokenBucket(capacity=10, refill_rate=2)

print("Testing Token Bucket:")
initial_tokens = bucket.get_tokens()
print(f"Initial tokens: {initial_tokens:.1f}")

# Fire 15 requests 0.1 s apart (10 req/s), faster than the 2 tokens/s refill
for i in range(15):
    success = bucket.consume(1)
    mark = '✓' if success else '✗'
    remaining = bucket.get_tokens()
    print(f"Request {i+1}: {mark} (tokens: {remaining:.1f})")
    time.sleep(0.1)

# The loop above slept 15 x 0.1 s, hence the "1.5 seconds" label
tokens_after_burst = bucket.get_tokens()
print(f"\nAfter 1.5 seconds: {tokens_after_burst:.1f} tokens")

# Stay idle for another 1.5 s so the bucket can refill, then look again
time.sleep(1.5)
tokens_after_idle = bucket.get_tokens()
print(f"After refill: {tokens_after_idle:.1f} tokens")

## 2. Sliding Window Rate Limiter

In [None]:
class SlidingWindowRateLimiter:
    """Rate limiter that keeps a timestamp log of the accepted requests.

    A request is accepted while fewer than ``max_requests`` accepted
    timestamps fall inside the trailing ``window_size_seconds`` window.
    """

    def __init__(self, max_requests, window_size_seconds):
        """
        Create the limiter with an empty request log.

        Args:
            max_requests: Maximum requests in window
            window_size_seconds: Window size in seconds
        """
        self.lock = threading.Lock()
        self.requests = deque()
        self.window_size = window_size_seconds
        self.max_requests = max_requests

    def _cleanup_old_requests(self):
        """Drop logged timestamps that are older than the window."""
        oldest_allowed = time.time() - self.window_size
        log = self.requests

        # Timestamps are appended in arrival order, so trim from the left.
        while log and log[0] < oldest_allowed:
            log.popleft()

    def is_allowed(self):
        """Return True and log the request if the window has room, else False."""
        with self.lock:
            self._cleanup_old_requests()

            if len(self.requests) >= self.max_requests:
                return False

            self.requests.append(time.time())
            return True

    def get_current_count(self):
        """Return how many accepted requests are still inside the window."""
        with self.lock:
            self._cleanup_old_requests()
            in_window = len(self.requests)
        return in_window

# Test sliding window: at most 5 accepted requests in any 2-second window
limiter = SlidingWindowRateLimiter(max_requests=5, window_size_seconds=2)

print("\nTesting Sliding Window Rate Limiter:")
print("Limit: 5 requests per 2 seconds")

# Send 10 requests 0.3 s apart; later ones are accepted again once the
# earliest accepted timestamps slide out of the window.
for i in range(10):
    allowed = limiter.is_allowed()
    count = limiter.get_current_count()
    print(f"Request {i+1}: {'✓' if allowed else '✗'} (count: {count}/5)")
    time.sleep(0.3)

# Wait one full window before reporting. Without this pause the requests
# accepted near the end of the loop are still inside the window, and the
# "After window expires" line would not show an expired window.
time.sleep(limiter.window_size)
print(f"\nAfter window expires: {limiter.get_current_count()}/5")

## 3. Multi-Dimensional Rate Limiter for LLMs

In [None]:
class LLMRateLimiter:
    """Rate limiter enforcing several independent limits at once.

    Usage is kept in one timestamped deque per dimension (``'requests'``,
    ``'tokens'``, ``'cost'``). Requests and tokens are evaluated over the
    last 60 seconds, cost over the last 3600 seconds.

    ``check_limits`` only inspects the counters; usage is added by a separate
    ``record_usage`` call. The lock is released between the two, so several
    callers can pass the check before any of them records usage.
    """

    # Window lengths (seconds) implied by the names of the limit keys.
    _MINUTE_WINDOW = 60
    _HOUR_WINDOW = 3600

    def __init__(self, limits):
        """
        Multi-dimensional rate limiter for LLM services.

        Args:
            limits: Dict with limit configurations
                   e.g., {
                       'requests_per_minute': 60,
                       'tokens_per_minute': 10000,
                       'cost_per_hour': 10.0
                   }
                   A dimension whose key is absent is not enforced.
        """
        self.limits = limits
        self.counters = defaultdict(deque)
        self.lock = threading.Lock()

    def _cleanup_counter(self, counter_name, window_seconds):
        """Clean up old entries from a counter. Callers hold ``self.lock``."""
        cutoff = time.time() - window_seconds

        counter = self.counters[counter_name]
        # Entries are appended in time order, so expired ones are on the left.
        while counter and counter[0]['timestamp'] < cutoff:
            counter.popleft()

    def _window_total(self, counter_name, window_seconds):
        """Expire old entries, then return the sum of the remaining values."""
        self._cleanup_counter(counter_name, window_seconds)
        return sum(entry['value'] for entry in self.counters[counter_name])

    def check_limits(self, request_tokens=0, estimated_cost=0):
        """Check if request is within all limits.

        Args:
            request_tokens: Tokens the prospective request would use.
            estimated_cost: Cost the prospective request would incur.

        Returns:
            ``(True, "OK")`` if every configured limit has room, otherwise
            ``(False, reason)`` for the first limit that would be exceeded
            (checked in the order requests, tokens, cost). Nothing is
            recorded by this call.
        """
        with self.lock:
            # Check request rate limit
            if 'requests_per_minute' in self.limits:
                self._cleanup_counter('requests', self._MINUTE_WINDOW)
                if len(self.counters['requests']) >= self.limits['requests_per_minute']:
                    return False, "Request rate limit exceeded"

            # Check token rate limit
            if 'tokens_per_minute' in self.limits:
                current_tokens = self._window_total('tokens', self._MINUTE_WINDOW)
                if current_tokens + request_tokens > self.limits['tokens_per_minute']:
                    return False, "Token rate limit exceeded"

            # Check cost limit
            if 'cost_per_hour' in self.limits:
                current_cost = self._window_total('cost', self._HOUR_WINDOW)
                if current_cost + estimated_cost > self.limits['cost_per_hour']:
                    return False, "Cost limit exceeded"

            return True, "OK"

    def record_usage(self, tokens_used=0, actual_cost=0):
        """Record actual usage after request completion.

        One request is always recorded; token and cost entries are added
        only when the given amount is greater than zero.
        """
        with self.lock:
            now = time.time()

            # Record request
            self.counters['requests'].append({'timestamp': now, 'value': 1})

            # Record tokens
            if tokens_used > 0:
                self.counters['tokens'].append({'timestamp': now, 'value': tokens_used})

            # Record cost
            if actual_cost > 0:
                self.counters['cost'].append({'timestamp': now, 'value': actual_cost})

    def get_usage_stats(self):
        """Get current usage statistics.

        Returns:
            Dict with ``requests_last_minute``, ``tokens_last_minute`` and
            ``cost_last_hour``, computed after expiring old entries.
        """
        with self.lock:
            self._cleanup_counter('requests', self._MINUTE_WINDOW)

            return {
                'requests_last_minute': len(self.counters['requests']),
                'tokens_last_minute': self._window_total('tokens', self._MINUTE_WINDOW),
                'cost_last_hour': self._window_total('cost', self._HOUR_WINDOW),
            }

# Multi-dimensional limiter demo with deliberately tight limits
demo_limits = {
    'requests_per_minute': 10,
    'tokens_per_minute': 1000,
    'cost_per_hour': 5.0,
}
llm_limiter = LLMRateLimiter(demo_limits)

print("\nTesting Multi-Dimensional Rate Limiter:")
print("Limits: 10 req/min, 1000 tokens/min, $5/hour")

# Simulate 15 requests of random size, priced at $0.001 per token
for i in range(15):
    tokens = np.random.randint(50, 200)
    cost = tokens * 0.001

    allowed, reason = llm_limiter.check_limits(tokens, cost)

    if not allowed:
        print(f"Request {i+1}: ✗ ({reason})")
    else:
        # Usage is recorded only for requests that passed the check
        llm_limiter.record_usage(tokens, cost)
        print(f"Request {i+1}: ✓ ({tokens} tokens, ${cost:.3f})")

    time.sleep(0.1)

# Report what the limiter has accumulated
stats = llm_limiter.get_usage_stats()
requests_used = stats['requests_last_minute']
tokens_used = stats['tokens_last_minute']
cost_used = stats['cost_last_hour']
print("\nFinal usage:")
print(f"  Requests: {requests_used}/10")
print(f"  Tokens: {tokens_used}/1000")
print(f"  Cost: ${cost_used:.3f}/$5.00")

## 4. Fair Queuing for Multi-Tenant Systems

In [None]:
class FairQueue:
    """Per-tenant FIFO queues served in proportion to tenant weights.

    Scheduling uses stride scheduling: every tenant carries a "pass" value,
    ``dequeue`` serves the backlogged tenant with the smallest pass and then
    advances that pass by ``1 / weight``. Over time a tenant with weight 3
    is therefore served about three times as often as a tenant with
    weight 1, and no backlogged tenant is starved.
    """

    def __init__(self, max_queue_size=100):
        """
        Args:
            max_queue_size: Maximum number of pending requests per tenant.
        """
        self.queues = defaultdict(deque)
        self.weights = defaultdict(lambda: 1.0)  # Default weight
        self.last_served = defaultdict(lambda: 0)  # time.time() of last service
        self.max_queue_size = max_queue_size
        self.lock = threading.Lock()
        self.request_counter = 0
        # Stride-scheduling state: per-tenant pass and the pass of the most
        # recently served tenant (the scheduler's "virtual time").
        self._pass = defaultdict(float)
        self._virtual_time = 0.0

    def set_weight(self, tenant_id, weight):
        """Set weight for a tenant (higher weight = more resources).

        Raises:
            ValueError: If ``weight`` is not greater than zero; the stride
                ``1 / weight`` is undefined for such weights.
        """
        if not weight > 0:
            raise ValueError("weight must be positive")
        with self.lock:
            self.weights[tenant_id] = weight

    def enqueue(self, tenant_id, request):
        """Add request to tenant's queue.

        The request dict is modified in place: ``'id'`` and
        ``'enqueue_time'`` are set.

        Returns:
            ``(True, "Enqueued")`` on success, ``(False, "Queue full")`` when
            the tenant already has ``max_queue_size`` pending requests.
        """
        with self.lock:
            queue = self.queues[tenant_id]
            if len(queue) >= self.max_queue_size:
                return False, "Queue full"

            if not queue:
                # A tenant that was idle must not bank credit for the idle
                # period: bring its pass up to the current virtual time.
                self._pass[tenant_id] = max(self._pass[tenant_id], self._virtual_time)

            self.request_counter += 1
            request['id'] = self.request_counter
            request['enqueue_time'] = time.time()

            queue.append(request)
            return True, "Enqueued"

    def dequeue(self):
        """Dequeue next request using weighted fair queuing.

        Returns:
            The oldest request of the backlogged tenant with the smallest
            pass, with ``'wait_time'`` and ``'tenant_id'`` added, or ``None``
            when every queue is empty. Ties go to the tenant whose queue was
            created first.
        """
        with self.lock:
            best_tenant = None
            best_pass = None  # None means "no backlogged tenant found yet"

            for tenant_id, queue in self.queues.items():
                if not queue:
                    continue
                tenant_pass = self._pass[tenant_id]
                if best_pass is None or tenant_pass < best_pass:
                    best_tenant, best_pass = tenant_id, tenant_pass

            # Test the sentinel rather than the tenant id, so falsy ids
            # such as 0 or "" are still served.
            if best_pass is None:
                return None

            request = self.queues[best_tenant].popleft()
            now = time.time()

            self._virtual_time = best_pass
            self._pass[best_tenant] = best_pass + 1.0 / self.weights[best_tenant]
            self.last_served[best_tenant] = now

            # Calculate wait time
            request['wait_time'] = now - request['enqueue_time']
            request['tenant_id'] = best_tenant

            return request

    def get_queue_stats(self):
        """Get queue statistics.

        Returns:
            Dict mapping each tenant id to its ``queue_length``, ``weight``
            and ``last_served`` timestamp (0 if never served).
        """
        with self.lock:
            return {
                tenant_id: {
                    'queue_length': len(queue),
                    'weight': self.weights[tenant_id],
                    'last_served': self.last_served[tenant_id],
                }
                for tenant_id, queue in self.queues.items()
            }

# Fair-queuing demo: three tenants with different weights share one FairQueue
fair_queue = FairQueue()

# Higher weight = larger intended share of service
fair_queue.set_weight('premium', 3.0)
fair_queue.set_weight('standard', 1.0)
fair_queue.set_weight('basic', 0.5)

print("\nTesting Fair Queuing:")
print("Weights: Premium=3.0, Standard=1.0, Basic=0.5")

# Enqueue 15 requests round-robin, i.e. five per tenant
tenants = ['premium', 'standard', 'basic']
for i in range(15):
    tenant = tenants[i % 3]
    request = {'data': f'Request {i+1} from {tenant}'}

    success, msg = fair_queue.enqueue(tenant, request)
    if not success:
        continue
    print(f"  Enqueued: {request['data']}")

# Drain the queue, tracking per-tenant counts and wait times
print("\nProcessing requests (fair queuing):")
processed_count = defaultdict(int)
total_wait_times = defaultdict(list)

request = fair_queue.dequeue()
while request:
    tenant = request['tenant_id']
    waited = request['wait_time']
    processed_count[tenant] += 1
    total_wait_times[tenant].append(waited)

    print(f"  Processed: {request['data']} (wait: {waited:.3f}s)")
    time.sleep(0.1)  # Simulate processing time
    request = fair_queue.dequeue()

# Per-tenant summary
print("\nFairness Results:")
for tenant in tenants:
    waits = total_wait_times[tenant]
    count = processed_count[tenant]
    weight = fair_queue.weights[tenant]
    avg_wait = np.mean(waits) if waits else 0

    print(f"  {tenant.capitalize()}: {count} requests (weight: {weight}, avg wait: {avg_wait:.3f}s)")

# Processed-count ratios relative to the standard tenant.
# NOTE(review): each tenant enqueues five requests and the loop above drains
# every queue, so these counts are equal and both ratios print 1.00 whatever
# the service order was; the order shows up in the wait times instead.
standard_count = processed_count['standard']
if standard_count > 0:
    premium_ratio = processed_count['premium'] / standard_count
    basic_ratio = processed_count['basic'] / standard_count

    print("\nFairness Ratios (vs Standard):")
    print(f"  Premium: {premium_ratio:.2f} (expected: 3.0)")
    print(f"  Basic: {basic_ratio:.2f} (expected: 0.5)")

## 5. Rate Limiting Performance Analysis

In [None]:
def simulate_rate_limiting_scenarios(request_rates=None, test_duration=10):
    """Compare different rate limiting algorithms under load.

    For every offered rate, a fresh token bucket (capacity 10, refill 5/s)
    and a fresh sliding window limiter (50 requests per 10 s) receive the
    same paced stream of requests for ``test_duration`` seconds.

    Args:
        request_rates: Iterable of offered rates in requests per second.
            Defaults to ``[1, 2, 5, 10, 15, 20]``.
        test_duration: Seconds to run each rate. Total runtime is roughly
            ``len(request_rates) * test_duration`` seconds.

    Returns:
        ``(request_rates, results)`` where ``results`` maps
        ``'token_bucket'``, ``'sliding_window'`` and ``'no_limit'`` to a list
        of accepted requests per second, one entry per rate. ``'no_limit'``
        is the rate that was actually offered.

    Raises:
        ValueError: If ``test_duration`` or any rate is not positive (both
            are used as divisors).
    """
    if request_rates is None:
        request_rates = [1, 2, 5, 10, 15, 20]  # requests per second
    else:
        request_rates = list(request_rates)

    if test_duration <= 0:
        raise ValueError("test_duration must be positive")
    if any(rate <= 0 for rate in request_rates):
        raise ValueError("request rates must be positive")

    results = {
        'token_bucket': [],
        'sliding_window': [],
        'no_limit': []
    }

    for rate in request_rates:
        print(f"Testing at {rate} req/s...")

        # Token bucket (10 capacity, 5 refill rate)
        bucket = TokenBucket(capacity=10, refill_rate=5)
        bucket_accepted = 0

        # Sliding window (50 requests per 10 seconds)
        window = SlidingWindowRateLimiter(max_requests=50, window_size_seconds=10)
        window_accepted = 0

        # No limit baseline: counts every request that was offered
        no_limit_accepted = 0

        # Simulate requests, pacing them with a fixed sleep between attempts.
        # The achieved rate can fall below the nominal one because each
        # iteration also spends time outside the sleep; the baseline count
        # records what was really offered.
        start_time = time.time()
        request_interval = 1.0 / rate

        while time.time() - start_time < test_duration:
            # Token bucket test
            if bucket.consume(1):
                bucket_accepted += 1

            # Sliding window test
            if window.is_allowed():
                window_accepted += 1

            # No limit test
            no_limit_accepted += 1

            time.sleep(request_interval)

        results['token_bucket'].append(bucket_accepted / test_duration)
        results['sliding_window'].append(window_accepted / test_duration)
        results['no_limit'].append(no_limit_accepted / test_duration)

    return request_rates, results

# Run the comparison with the default rates and duration
print("Running rate limiting performance analysis...")
rates, results = simulate_rate_limiting_scenarios()

offered = results['no_limit']
bucket_rates = results['token_bucket']
window_rates = results['sliding_window']

# Accepted rate against offered rate for each algorithm
plt.figure(figsize=(10, 6))
plt.plot(rates, offered, 'k--', label='No Limit', alpha=0.7)
plt.plot(rates, bucket_rates, 'o-', label='Token Bucket (10 cap, 5/s refill)')
plt.plot(rates, window_rates, 's-', label='Sliding Window (50/10s)')

plt.xlabel('Request Rate (req/s)')
plt.ylabel('Accepted Rate (req/s)')
plt.title('Rate Limiting Algorithm Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Share of the offered requests that each limiter accepted
print("\nRate Limiting Effectiveness:")
for rate, baseline, tb_rate, sw_rate in zip(rates, offered, bucket_rates, window_rates):
    tb_eff = tb_rate / baseline
    sw_eff = sw_rate / baseline
    print(f"  {rate} req/s: Token Bucket {tb_eff:.1%}, Sliding Window {sw_eff:.1%}")

## Conclusion

Rate limiting is essential for LLM serving systems:

1. **Token Bucket**: Good for handling bursts while maintaining average rate
2. **Sliding Window**: Provides precise rate control over time windows
3. **Multi-Dimensional**: Controls multiple resource types (requests, tokens, cost)
4. **Fair Queuing**: Ensures fair resource allocation across tenants

Choose the right algorithm based on your specific requirements for burst handling, precision, and fairness.