# Day 34: Multi-Tenancy Architecture - Part 3

Implementing multi-tenant LLM systems with resource isolation and fair allocation.

## Overview
1. Tenant resource management
2. Isolation strategies
3. Cost allocation
4. Performance monitoring

In [None]:
import time
import threading
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, deque
from dataclasses import dataclass
from enum import Enum

## 1. Tenant Configuration and Tiers

In [None]:
class TenantTier(Enum):
    """Service tiers a tenant can be assigned to.

    Each member's value is the lowercase tier name, which is the string
    shown when a tenant's tier is printed or reported (``tier.value``).
    """
    BASIC = "basic"
    STANDARD = "standard"
    PREMIUM = "premium"
    ENTERPRISE = "enterprise"

@dataclass
class TenantConfig:
    """Resource limits and pricing that apply to a single tenant."""
    # Identifier of the tenant; used as the lookup key once registered.
    tenant_id: str
    # Service tier this configuration was derived from.
    tier: TenantTier
    # Maximum number of requests that may be started per minute.
    max_requests_per_minute: int
    # Maximum number of tokens that may be consumed per minute.
    max_tokens_per_minute: int
    # Maximum number of requests that may be in flight at the same time.
    max_concurrent_requests: int
    # Relative priority of the tenant (higher for higher tiers).
    priority_weight: float
    # Price charged for each token the tenant uses.
    cost_per_token: float
    
# Tier configurations, written as one row of limits per tier.
# The field names below are the TenantConfig keyword arguments each row fills,
# in the same order as the values in the rows.
_TIER_LIMIT_FIELDS = (
    'max_requests_per_minute',
    'max_tokens_per_minute',
    'max_concurrent_requests',
    'priority_weight',
    'cost_per_token',
)

_TIER_LIMIT_ROWS = {
    #                       req/min  tokens/min  concurrent  weight  cost/token
    TenantTier.BASIC:      (60,      5000,       2,          0.5,    0.002),
    TenantTier.STANDARD:   (300,     25000,      5,          1.0,    0.0015),
    TenantTier.PREMIUM:    (1000,    100000,     15,         2.0,    0.001),
    TenantTier.ENTERPRISE: (5000,    500000,     50,         3.0,    0.0008),
}

# Expand every row into a {field name: value} dictionary keyed by tier.
TIER_CONFIGS = {
    tier: dict(zip(_TIER_LIMIT_FIELDS, row))
    for tier, row in _TIER_LIMIT_ROWS.items()
}

def create_tenant_config(tenant_id: str, tier: TenantTier) -> TenantConfig:
    """Build the TenantConfig for ``tenant_id`` from the limits of ``tier``.

    The limits are looked up in ``TIER_CONFIGS``; a tier that has no entry
    there raises ``KeyError``.
    """
    tier_limits = TIER_CONFIGS[tier]
    return TenantConfig(tenant_id=tenant_id, tier=tier, **tier_limits)

# Create sample tenants: one tenant per tier
_SAMPLE_TENANT_TIERS = [
    ('startup_a', TenantTier.BASIC),
    ('company_b', TenantTier.STANDARD),
    ('corp_c', TenantTier.PREMIUM),
    ('enterprise_d', TenantTier.ENTERPRISE),
]
tenants = {
    name: create_tenant_config(name, tier)
    for name, tier in _SAMPLE_TENANT_TIERS
}

# Show the headline limits of every sample tenant
print("Tenant Configurations:")
for tenant_id, config in tenants.items():
    summary = (
        f"  {tenant_id} ({config.tier.value}): {config.max_requests_per_minute} req/min, "
        f"{config.max_tokens_per_minute} tokens/min, weight: {config.priority_weight}"
    )
    print(summary)

## 2. Multi-Tenant Resource Manager

In [None]:
class MultiTenantResourceManager:
    """Enforce per-tenant request, token and concurrency limits and track cost.

    For every tenant the manager keeps the start times of recent requests,
    the token counts of recently completed requests, the number of requests
    currently in flight and the accumulated cost. Records older than the
    sliding window (60 seconds by default) are discarded before limits are
    checked or statistics are reported.

    All public methods serialize on ``self.lock``. The lock is a plain
    ``threading.Lock`` (not reentrant), so code that already holds it must
    only call the private ``_``-prefixed helpers.

    NOTE: ``check_request_allowed`` and ``start_request`` are two separate
    locked steps. Two threads acting for the same tenant can both pass the
    check before either one calls ``start_request``.
    """

    def __init__(self):
        # tenant_id -> TenantConfig
        self.tenants = {}
        # tenant_id -> usage record, created lazily on first access
        self.usage_tracking = defaultdict(lambda: {
            'requests': deque(),   # start timestamps of recent requests
            'tokens': deque(),     # {'timestamp', 'tokens'} per completed request
            'concurrent': 0,       # requests started but not yet completed
            'total_cost': 0        # accumulated cost over the manager's lifetime
        })
        self.lock = threading.Lock()

    def register_tenant(self, config: TenantConfig):
        """Register a tenant, replacing any existing config with the same id."""
        with self.lock:
            self.tenants[config.tenant_id] = config

    def _cleanup_usage_history(self, tenant_id: str, window_seconds: int = 60):
        """Drop request and token records older than ``window_seconds``.

        Must be called with ``self.lock`` held.
        """
        cutoff = time.time() - window_seconds
        usage = self.usage_tracking[tenant_id]

        # Both deques are appended in time order, so expired entries are
        # always at the left end.
        while usage['requests'] and usage['requests'][0] < cutoff:
            usage['requests'].popleft()

        while usage['tokens'] and usage['tokens'][0]['timestamp'] < cutoff:
            usage['tokens'].popleft()

    @staticmethod
    def _tokens_in_window(usage: dict) -> int:
        """Return the total tokens recorded in ``usage['tokens']``."""
        return sum(entry['tokens'] for entry in usage['tokens'])

    def check_request_allowed(self, tenant_id: str, estimated_tokens: int = 0) -> tuple[bool, str]:
        """Check whether a new request from ``tenant_id`` fits its limits.

        Args:
            tenant_id: Tenant that wants to send a request.
            estimated_tokens: Tokens the request is expected to consume;
                added to the tokens already used in the current window.

        Returns:
            ``(allowed, reason)``. ``reason`` names the first limit that was
            hit, or is ``"Request allowed"``.
        """
        with self.lock:
            if tenant_id not in self.tenants:
                return False, "Tenant not registered"

            config = self.tenants[tenant_id]
            usage = self.usage_tracking[tenant_id]

            # Expire old records so only the current window is counted
            self._cleanup_usage_history(tenant_id)

            if usage['concurrent'] >= config.max_concurrent_requests:
                return False, "Concurrent request limit exceeded"

            if len(usage['requests']) >= config.max_requests_per_minute:
                return False, "Request rate limit exceeded"

            # Only tokens of completed requests are recorded, so requests
            # that are still in flight do not count here.
            current_tokens = self._tokens_in_window(usage)
            if current_tokens + estimated_tokens > config.max_tokens_per_minute:
                return False, "Token rate limit exceeded"

            return True, "Request allowed"

    def start_request(self, tenant_id: str) -> bool:
        """Mark a request as started.

        Returns:
            ``True`` once the request has been counted, ``False`` if the
            tenant is not registered (nothing is recorded in that case).
        """
        with self.lock:
            # Refuse unknown tenants: complete_request() cannot look them up,
            # so a counted request could never be released again.
            if tenant_id not in self.tenants:
                return False

            usage = self.usage_tracking[tenant_id]
            usage['concurrent'] += 1
            usage['requests'].append(time.time())
            return True

    def complete_request(self, tenant_id: str, tokens_used: int) -> float:
        """Mark a request as completed and record its token usage and cost.

        Args:
            tenant_id: Tenant whose request finished.
            tokens_used: Tokens the request actually consumed.

        Returns:
            The cost charged for this request
            (``tokens_used * cost_per_token``).

        Raises:
            KeyError: If ``tenant_id`` is not registered.
        """
        with self.lock:
            config = self.tenants[tenant_id]
            usage = self.usage_tracking[tenant_id]

            # Never let the in-flight counter go negative
            usage['concurrent'] = max(0, usage['concurrent'] - 1)

            usage['tokens'].append({
                'timestamp': time.time(),
                'tokens': tokens_used
            })

            cost = tokens_used * config.cost_per_token
            usage['total_cost'] += cost

            return cost

    def get_tenant_stats(self, tenant_id: str) -> dict:
        """Return current usage statistics for one tenant.

        Returns an empty dict when the tenant is not registered.
        """
        with self.lock:
            if tenant_id not in self.tenants:
                return {}

            config = self.tenants[tenant_id]
            usage = self.usage_tracking[tenant_id]

            self._cleanup_usage_history(tenant_id)

            return {
                'tenant_id': tenant_id,
                'tier': config.tier.value,
                'requests_last_minute': len(usage['requests']),
                'max_requests_per_minute': config.max_requests_per_minute,
                'tokens_last_minute': self._tokens_in_window(usage),
                'max_tokens_per_minute': config.max_tokens_per_minute,
                'concurrent_requests': usage['concurrent'],
                'max_concurrent_requests': config.max_concurrent_requests,
                'total_cost': usage['total_cost'],
                'priority_weight': config.priority_weight
            }

    def get_all_stats(self) -> dict:
        """Return ``{tenant_id: stats}`` for every registered tenant."""
        # Snapshot the ids under the lock so a concurrent register_tenant()
        # cannot change the dict while it is being iterated. The lock is
        # released before get_tenant_stats() re-acquires it.
        with self.lock:
            tenant_ids = list(self.tenants)
        return {tenant_id: self.get_tenant_stats(tenant_id)
                for tenant_id in tenant_ids}

# Build the resource manager used by the rest of the notebook
resource_manager = MultiTenantResourceManager()

# Make every sample tenant known to the manager
for config in tenants.values():
    resource_manager.register_tenant(config)

print("\nResource Manager initialized with tenants.")

## 3. Simulating Multi-Tenant Workload

In [None]:
def simulate_multi_tenant_workload(resource_manager, duration_seconds=30, tenant_patterns=None):
    """Simulate concurrent request traffic from several tenants.

    One worker thread is started per tenant. Each worker repeatedly waits
    for its request interval, asks ``resource_manager`` whether a request of
    a random size is allowed, and, if it is, starts it, sleeps to imitate
    processing (1 ms per token) and completes it.

    Args:
        resource_manager: Object providing ``check_request_allowed``,
            ``start_request`` and ``complete_request``.
        duration_seconds: How long the workers keep issuing requests.
        tenant_patterns: Optional mapping ``tenant_id -> {'rate': requests
            per second, 'token_range': (low, high)}``; ``high`` is exclusive.
            Defaults to the four sample tenants used in this notebook.

    Returns:
        A dict mapping tenant id to its counters (``requests_attempted``,
        ``requests_accepted``, ``requests_rejected``, ``total_tokens``,
        ``total_cost``, ``rejection_reasons``). Only tenants that attempted
        at least one request appear in it.

    Raises:
        ValueError: If a pattern's ``rate`` is not positive.
    """
    if tenant_patterns is None:
        # Different request patterns for different tiers
        tenant_patterns = {
            'startup_a': {'rate': 0.5, 'token_range': (50, 200)},      # Slow, small requests
            'company_b': {'rate': 2.0, 'token_range': (100, 500)},     # Medium rate
            'corp_c': {'rate': 5.0, 'token_range': (200, 800)},        # High rate
            'enterprise_d': {'rate': 10.0, 'token_range': (500, 1500)} # Very high rate
        }

    # Fail early: inside a worker a zero rate would only kill that thread.
    for tenant_id, pattern in tenant_patterns.items():
        if pattern['rate'] <= 0:
            raise ValueError(f"Request rate for tenant {tenant_id!r} must be positive")

    results = defaultdict(lambda: {
        'requests_attempted': 0,
        'requests_accepted': 0,
        'requests_rejected': 0,
        'total_tokens': 0,
        'total_cost': 0,
        'rejection_reasons': defaultdict(int)
    })

    start_time = time.time()

    def worker_thread(tenant_id, pattern):
        """Issue requests for one tenant until the simulation time is over."""
        # Gap between the end of one request and the start of the next
        interval = 1.0 / pattern['rate']
        last_request_time = start_time

        while time.time() - start_time < duration_seconds:
            # Wait until it's time for the next request
            sleep_time = last_request_time + interval - time.time()
            if sleep_time > 0:
                time.sleep(sleep_time)

            # Generate request
            tokens = np.random.randint(*pattern['token_range'])

            # Each worker only ever touches its own tenant's entry
            tenant_results = results[tenant_id]
            tenant_results['requests_attempted'] += 1

            allowed, reason = resource_manager.check_request_allowed(tenant_id, tokens)

            if allowed:
                resource_manager.start_request(tenant_id)
                tenant_results['requests_accepted'] += 1

                # Simulate processing time (proportional to tokens)
                processing_time = tokens * 0.001  # 1ms per token
                time.sleep(processing_time)

                cost = resource_manager.complete_request(tenant_id, tokens)
                tenant_results['total_tokens'] += tokens
                tenant_results['total_cost'] += cost
            else:
                tenant_results['requests_rejected'] += 1
                tenant_results['rejection_reasons'][reason] += 1

            last_request_time = time.time()

    # Start one worker thread per tenant
    threads = []
    for tenant_id, pattern in tenant_patterns.items():
        thread = threading.Thread(target=worker_thread, args=(tenant_id, pattern))
        thread.daemon = True
        thread.start()
        threads.append(thread)

    # Wait for simulation to complete
    for thread in threads:
        thread.join()

    return dict(results)

# Run simulation
print("Running multi-tenant simulation for 20 seconds...")
simulation_results = simulate_multi_tenant_workload(resource_manager, duration_seconds=20)

# Display per-tenant results
print("\n=== Simulation Results ===")
for tenant_id, results in simulation_results.items():
    attempted_count = results['requests_attempted']
    accepted_count = results['requests_accepted']

    # Guard both ratios against a zero denominator, matching the guard used
    # in the utilization analysis further down.
    acceptance_rate = accepted_count / attempted_count * 100 if attempted_count > 0 else 0
    avg_tokens = results['total_tokens'] / accepted_count if accepted_count > 0 else 0

    print(f"\n{tenant_id} ({tenants[tenant_id].tier.value}):")
    print(f"  Attempted: {attempted_count}")
    print(f"  Accepted: {accepted_count} ({acceptance_rate:.1f}%)")
    print(f"  Rejected: {results['requests_rejected']}")
    print(f"  Avg tokens/request: {avg_tokens:.0f}")
    print(f"  Total cost: ${results['total_cost']:.3f}")

    # Only list rejection reasons when something was actually rejected
    if results['rejection_reasons']:
        print("  Rejection reasons:")
        for reason, count in results['rejection_reasons'].items():
            print(f"    {reason}: {count}")

## 4. Resource Utilization Analysis

In [None]:
# Snapshot the manager's view of every tenant after the simulation
final_stats = resource_manager.get_all_stats()

# Per-tenant series for the charts, all in the order of tenant_names
tenant_names = list(final_stats.keys())
tiers = [final_stats[t]['tier'] for t in tenant_names]
acceptance_rates, cost_per_tenant, utilization_rates = [], [], []

for tenant_id in tenant_names:
    sim_result = simulation_results[tenant_id]
    stats = final_stats[tenant_id]

    # Share of attempted requests that were accepted, as a percentage
    requests_made = sim_result['requests_attempted']
    acceptance_rate = sim_result['requests_accepted'] / requests_made if requests_made > 0 else 0
    acceptance_rates.append(acceptance_rate * 100)

    # Cost accumulated during the simulation
    cost_per_tenant.append(sim_result['total_cost'])

    # Tokens used in the last minute relative to the per-minute limit, as a percentage
    token_limit = stats['max_tokens_per_minute']
    utilization = stats['tokens_last_minute'] / token_limit if token_limit > 0 else 0
    utilization_rates.append(utilization * 100)

# Create a 2x2 grid of charts
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

colors = ['red', 'orange', 'green', 'blue']
# Every chart labels its bars with "<tenant>\n(<tier>)"
tick_labels = [f"{name}\n({tier})" for name, tier in zip(tenant_names, tiers)]
bar_positions = range(len(tenant_names))

# Charts 1-3 are plain bar charts that differ only in data and captions:
# acceptance rate, cost and token utilization per tenant.
single_series_charts = [
    (ax1, acceptance_rates, 'Acceptance Rate (%)', 'Request Acceptance Rate by Tenant'),
    (ax2, cost_per_tenant, 'Total Cost ($)', 'Cost by Tenant'),
    (ax3, utilization_rates, 'Token Utilization (%)', 'Resource Utilization by Tenant'),
]
for axis, values, y_label, chart_title in single_series_charts:
    axis.bar(bar_positions, values, color=colors)
    axis.set_xlabel('Tenants')
    axis.set_ylabel(y_label)
    axis.set_title(chart_title)
    axis.set_xticks(bar_positions)
    axis.set_xticklabels(tick_labels, rotation=45)
    axis.grid(axis='y', alpha=0.3)

# Chart 4: attempted vs accepted requests as side-by-side bars
attempted = [simulation_results[t]['requests_attempted'] for t in tenant_names]
accepted = [simulation_results[t]['requests_accepted'] for t in tenant_names]

x = np.arange(len(tenant_names))
width = 0.35

ax4.bar(x - width/2, attempted, width, label='Attempted', alpha=0.7)
ax4.bar(x + width/2, accepted, width, label='Accepted', alpha=0.7)
ax4.set_xlabel('Tenants')
ax4.set_ylabel('Number of Requests')
ax4.set_title('Requests Attempted vs Accepted')
ax4.set_xticks(x)
ax4.set_xticklabels(tick_labels, rotation=45)
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Report each tenant's share of accepted requests and of total cost
print("\n=== Fairness Analysis ===")
total_accepted = sum(simulation_results[t]['requests_accepted'] for t in tenant_names)
total_cost = sum(simulation_results[t]['total_cost'] for t in tenant_names)

for tenant_id in tenant_names:
    config = tenants[tenant_id]
    sim_result = simulation_results[tenant_id]

    # Shares are percentages of the totals; 0 when the total itself is 0
    if total_accepted > 0:
        request_share = sim_result['requests_accepted'] / total_accepted * 100
    else:
        request_share = 0

    if total_cost > 0:
        cost_share = sim_result['total_cost'] / total_cost * 100
    else:
        cost_share = 0

    print(f"{tenant_id} ({config.tier.value}):")
    print(f"  Weight: {config.priority_weight}, Request share: {request_share:.1f}%, Cost share: {cost_share:.1f}%")

## Conclusion

Multi-tenancy in LLM systems requires careful resource management:

1. **Tier-based Limits**: Different service levels with appropriate resource allocations
2. **Resource Isolation**: Prevents noisy neighbor problems
3. **Fair Allocation**: Weighted resource sharing based on tenant priority
4. **Cost Tracking**: Per-tenant usage and billing

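The simulation shows how per-tier request, token, and concurrency limits give each tier its own level of access to resources while keeping tenants isolated from one another. Note that `priority_weight` is only reported in the fairness analysis; the resource manager does not use it to schedule or share capacity, so weighted fair allocation would need an additional scheduling layer.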