# Day 32: Priority-Based Scheduling - Part 5c

Priority-based scheduling processes requests in order of importance rather than strictly in arrival order. This helps latency-sensitive requests meet their SLA targets and supports fair resource allocation across different classes of requests.

## Overview
1. Understanding priority scheduling
2. Implementation with priority queues
3. Measuring fairness and SLA compliance

In [None]:
import time
import heapq
import threading
import numpy as np
import matplotlib.pyplot as plt
from enum import Enum

## 1. Priority Levels and Request Classes

In [None]:
# Scheduling classes. A smaller value means the request is served sooner;
# iteration order is HIGH, MEDIUM, LOW.
Priority = Enum(
    "Priority",
    {"HIGH": 1, "MEDIUM": 2, "LOW": 3},
)

class PriorityRequest:
    """One generation request tagged with a scheduling priority.

    Instances order themselves with ``<`` so they can be stored directly in
    a ``heapq`` heap: the request with the smaller ``priority.value`` sorts
    first, and among equal priorities the earlier ``arrival_time`` wins.

    Args:
        id: Identifier of the request.
        prompt: Prompt text; also the initial value of ``result``.
        priority: Object exposing a numeric ``value`` (lower = more urgent).
        max_tokens: Number of tokens to generate before completion.
        sla_target: Target completion time in seconds, or ``None`` for none.
    """

    def __init__(self, id, prompt, priority, max_tokens=20, sla_target=None):
        # What the caller asked for.
        self.id, self.prompt, self.priority = id, prompt, priority
        self.max_tokens, self.sla_target = max_tokens, sla_target

        # Lifecycle timestamps; arrival is stamped at construction time.
        self.arrival_time = time.time()
        self.start_time = self.completion_time = None

        # Generation progress; the output text starts out as the prompt.
        self.generated_tokens = 0
        self.result = prompt

    def __lt__(self, other):
        """Return True when this request should be served before *other*."""
        mine, theirs = self.priority.value, other.priority.value
        if mine == theirs:
            # Same priority level: fall back to FIFO by arrival time.
            return self.arrival_time < other.arrival_time
        return mine < theirs

    @property
    def latency(self):
        """Seconds from arrival to completion, or None while unfinished."""
        finished_at = self.completion_time
        return None if finished_at is None else finished_at - self.arrival_time

    @property
    def sla_met(self):
        """Whether latency is within the SLA target.

        Returns None when the request has no target or has not completed.
        """
        if self.sla_target is None:
            return None
        elapsed = self.latency
        if elapsed is None:
            return None
        return elapsed <= self.sla_target

## 2. Priority-Based Scheduler Implementation

In [None]:
class PriorityScheduler:
    """Simulated continuous-batching scheduler that admits requests by priority.

    Submitted requests wait in a heap ordered by the requests' own ``<``
    comparison. A background worker thread repeatedly tops the active batch
    up to ``max_batch_size`` from the heap, simulates one decoding step for
    every active request, and retires requests that have produced
    ``max_tokens`` tokens. Admission is non-preemptive: once a request is in
    the active batch it keeps its slot until it completes.

    Args:
        max_batch_size: Maximum number of requests decoded together.
        token_generation_time: Simulated seconds per decoding step before the
            batch-size efficiency factor is applied.
    """

    def __init__(self, max_batch_size=8, token_generation_time=0.05):
        self.max_batch_size = max_batch_size
        self.token_generation_time = token_generation_time
        self.request_heap = []  # Priority queue of pending requests
        self.active_requests = {}  # request id -> request currently in the batch
        self.completed_requests = []  # finished requests, in completion order
        self.running = False
        self.thread = None  # worker thread, created by start()
        self.lock = threading.Lock()  # guards request_heap

    def submit_request(self, request):
        """Queue *request* for processing; safe to call while the worker runs."""
        with self.lock:
            heapq.heappush(self.request_heap, request)

    def start(self):
        """Start the background worker thread.

        Calling this while a worker is already alive is a no-op, so two
        workers never mutate ``active_requests`` concurrently.
        """
        if self.thread is not None and self.thread.is_alive():
            return
        self.running = True
        self.thread = threading.Thread(target=self._process_with_priority)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """Ask the worker to exit and wait for it; harmless if never started."""
        self.running = False
        if self.thread is not None:
            self.thread.join()

    def _process_with_priority(self):
        """Worker loop: admit, decode one step, retire; idle briefly when empty."""
        while self.running:
            # Add high-priority requests first
            self._add_requests_by_priority()

            # Process active requests
            if self.active_requests:
                self._generate_tokens()
                self._remove_completed()
            else:
                time.sleep(0.01)

    def _add_requests_by_priority(self):
        """Move pending requests into the active batch until it is full.

        Requests are popped in heap order and get ``start_time`` stamped on
        admission.
        """
        with self.lock:
            while (len(self.active_requests) < self.max_batch_size
                   and self.request_heap):
                request = heapq.heappop(self.request_heap)
                request.start_time = time.time()
                self.active_requests[request.id] = request

    def _generate_tokens(self):
        """Simulate one decoding step for every active request.

        The step sleeps for ``token_generation_time`` scaled by an efficiency
        factor of ``max(0.5, 1 - 0.1 * ln(batch_size))``: 1.0 for a single
        request, shrinking with batch size, never below 0.5. Does nothing
        when the batch is empty.
        """
        batch_size = len(self.active_requests)
        if batch_size == 0:
            # log(0) is -inf, which would turn the sleep duration into
            # inf/NaN and make time.sleep raise.
            return
        efficiency = max(0.5, 1.0 - 0.1 * np.log(batch_size))
        time.sleep(self.token_generation_time * efficiency)

        # Update all active requests
        for request in self.active_requests.values():
            request.generated_tokens += 1
            request.result += " token"

    def _remove_completed(self):
        """Retire active requests that have generated ``max_tokens`` tokens.

        Each retired request gets ``completion_time`` stamped and is appended
        to ``completed_requests``.
        """
        finished = [
            (req_id, request)
            for req_id, request in self.active_requests.items()
            if request.generated_tokens >= request.max_tokens
        ]
        # Delete after the scan so the dict is not mutated while iterated.
        for req_id, request in finished:
            request.completion_time = time.time()
            self.completed_requests.append(request)
            del self.active_requests[req_id]

    def get_metrics_by_priority(self):
        """Summarise completed requests per priority level.

        Returns:
            Dict keyed by priority name. Each value holds ``count``,
            ``avg_latency``, ``max_latency`` and ``sla_compliance`` (fraction
            of requests with an SLA target that met it, or ``None`` when no
            request in that level had a target). Levels with no completed
            requests are omitted.
        """
        # Snapshot so a still-running worker appending results cannot change
        # the list between the per-priority passes.
        completed = list(self.completed_requests)
        metrics = {}

        for priority in Priority:
            priority_requests = [r for r in completed if r.priority == priority]
            if not priority_requests:
                continue

            latencies = [r.latency for r in priority_requests]
            sla_compliance = [r.sla_met for r in priority_requests
                              if r.sla_met is not None]

            metrics[priority.name] = {
                'count': len(priority_requests),
                'avg_latency': np.mean(latencies),
                'max_latency': np.max(latencies),
                'sla_compliance': np.mean(sla_compliance) if sla_compliance else None
            }

        return metrics

## 3. Testing Priority Scheduling

In [None]:
def test_priority_scheduling():
    """Run six mixed-priority requests through a batch-of-4 scheduler.

    Requests are submitted 0.2 s apart. ``PriorityRequest`` stamps
    ``arrival_time`` when the object is constructed, and all six are built
    up front, so the arrival time is re-stamped at submission. Without that,
    reported latency and SLA compliance would include the artificial wait
    before the request ever reached the scheduler.

    Returns:
        The stopped scheduler, with all requests in ``completed_requests``.
    """
    scheduler = PriorityScheduler(max_batch_size=4)
    scheduler.start()

    # Submit mixed priority requests
    requests = [
        # High priority with tight SLA
        PriorityRequest(1, "Urgent: Stock price", Priority.HIGH, 10, sla_target=2.0),
        PriorityRequest(2, "Critical: System alert", Priority.HIGH, 8, sla_target=1.5),

        # Medium priority
        PriorityRequest(3, "Normal: User query", Priority.MEDIUM, 15, sla_target=5.0),
        PriorityRequest(4, "Standard: Report gen", Priority.MEDIUM, 12, sla_target=4.0),

        # Low priority
        PriorityRequest(5, "Batch: Data analysis", Priority.LOW, 20, sla_target=10.0),
        PriorityRequest(6, "Background: Summary", Priority.LOW, 18, sla_target=8.0),
    ]

    # Submit requests with some delay
    for request in requests:
        # Measure latency from the moment the request reaches the scheduler.
        request.arrival_time = time.time()
        scheduler.submit_request(request)
        time.sleep(0.2)  # Small delay between submissions

    # Wait for completion
    while len(scheduler.completed_requests) < len(requests):
        time.sleep(0.1)

    scheduler.stop()
    return scheduler

# Run the mixed-priority demo and collect per-priority statistics
scheduler = test_priority_scheduling()
metrics = scheduler.get_metrics_by_priority()

# Build each priority's report as a list of lines and print it in one call
for priority_name, data in metrics.items():
    report_lines = [
        f"\n{priority_name} Priority:",
        f"  Count: {data['count']}",
        f"  Avg Latency: {data['avg_latency']:.2f}s",
        f"  Max Latency: {data['max_latency']:.2f}s",
    ]
    # SLA compliance is None when no request in this level had a target
    if data['sla_compliance'] is not None:
        report_lines.append(f"  SLA Compliance: {data['sla_compliance']:.1%}")
    print("\n".join(report_lines))

## 4. Comparing FIFO vs Priority Scheduling

In [None]:
def compare_fifo_vs_priority():
    """Compare completion order and latency under FIFO vs priority scheduling.

    The FIFO baseline is computed analytically: requests are assumed to be
    available at time zero and served back-to-back in list order. The
    priority run uses a real ``PriorityScheduler`` with ``max_batch_size=1``.

    The FIFO per-token cost is taken from the scheduler's
    ``token_generation_time``. At batch size 1 the scheduler's efficiency
    factor is 1.0, so both methods are charged the same time per token and
    the latency comparison reflects ordering only.

    Returns:
        Dict with ``fifo_order`` / ``priority_order`` (priority names in
        completion order) and ``fifo_latencies`` / ``priority_latencies``
        (seconds, in the same order).
    """
    # Create test requests
    test_requests = [
        # Mix of priorities arriving in non-optimal order
        PriorityRequest(1, "Low priority task", Priority.LOW, 20),
        PriorityRequest(2, "Another low task", Priority.LOW, 18),
        PriorityRequest(3, "HIGH PRIORITY!", Priority.HIGH, 5),
        PriorityRequest(4, "Medium task", Priority.MEDIUM, 12),
        PriorityRequest(5, "URGENT HIGH!", Priority.HIGH, 8),
        PriorityRequest(6, "Low priority again", Priority.LOW, 15),
    ]

    # Process one at a time for clarity
    priority_scheduler = PriorityScheduler(max_batch_size=1)

    # Test FIFO (simple continuous batching), simulated in list order.
    # Use the scheduler's own per-token time so the baseline is comparable.
    per_token_time = priority_scheduler.token_generation_time
    fifo_completion_order = []
    fifo_latencies = []
    current_time = 0
    for request in test_requests:
        # Processing time proportional to tokens
        current_time += request.max_tokens * per_token_time
        fifo_completion_order.append(request.priority.name)
        fifo_latencies.append(current_time)

    # Test Priority Scheduling
    priority_scheduler.start()

    for request in test_requests:
        priority_scheduler.submit_request(request)
        time.sleep(0.05)  # Small delay

    # Wait for completion
    while len(priority_scheduler.completed_requests) < len(test_requests):
        time.sleep(0.1)

    priority_scheduler.stop()

    # Sort by completion time to get the order requests actually finished in
    sorted_requests = sorted(priority_scheduler.completed_requests,
                             key=lambda r: r.completion_time)
    priority_completion_order = [r.priority.name for r in sorted_requests]
    priority_latencies = [r.latency for r in sorted_requests]

    return {
        'fifo_order': fifo_completion_order,
        'fifo_latencies': fifo_latencies,
        'priority_order': priority_completion_order,
        'priority_latencies': priority_latencies
    }

# Compare methods: run both and pull out the values to report
comparison = compare_fifo_vs_priority()
fifo_order = comparison['fifo_order']
priority_order = comparison['priority_order']
mean_fifo_latency = np.mean(comparison['fifo_latencies'])
mean_priority_latency = np.mean(comparison['priority_latencies'])

# Completion order under each policy
print("FIFO Completion Order:", fifo_order)
print("Priority Completion Order:", priority_order)

# Mean latency under each policy
print(f"\nAverage Latencies:")
print(f"FIFO: {mean_fifo_latency:.2f}s")
print(f"Priority: {mean_priority_latency:.2f}s")

## 5. Visualizing Priority Impact

In [None]:
# Simulate larger workload
def simulate_priority_workload(num_requests=30):
    """Push a random mixed-priority workload through a batch-of-6 scheduler.

    Each request draws its priority (20% high, 50% medium, 30% low) and a
    token budget from ``np.random``, gets the SLA target of its priority
    level, and is submitted 0.1 s after the previous one. The function
    blocks until every request has completed.

    Args:
        num_requests: Number of requests to generate and submit.

    Returns:
        The stopped scheduler holding the completed requests.
    """
    # Lookup tables are fixed for the whole run, so build them once.
    levels = [Priority.HIGH, Priority.MEDIUM, Priority.LOW]
    level_weights = [0.2, 0.5, 0.3]  # 20% high, 50% medium, 30% low
    sla_by_level = {
        Priority.HIGH: 2.0,
        Priority.MEDIUM: 5.0,
        Priority.LOW: 10.0
    }

    sched = PriorityScheduler(max_batch_size=6)
    sched.start()

    for request_id in range(num_requests):
        level = np.random.choice(levels, p=level_weights)
        token_budget = np.random.randint(5, 20)

        sched.submit_request(PriorityRequest(
            id=request_id,
            prompt=f"Request {request_id}",
            priority=level,
            max_tokens=token_budget,
            sla_target=sla_by_level[level]
        ))
        time.sleep(0.1)  # Arrival rate

    # Poll until the worker has retired every submitted request
    while len(sched.completed_requests) < num_requests:
        time.sleep(0.1)

    sched.stop()
    return sched

# Run simulation
scheduler = simulate_priority_workload(30)
metrics = scheduler.get_metrics_by_priority()

# Plot results
priorities = list(metrics.keys())
avg_latencies = [metrics[p]['avg_latency'] for p in priorities]
# sla_compliance is None when no request in a level had a target; plot as 0
sla_compliance = [metrics[p]['sla_compliance'] or 0 for p in priorities]

# Colour bars by priority name rather than by position: a level with no
# completed requests is absent from metrics, and positional colours would
# then shift onto the wrong levels.
priority_colors = {'HIGH': 'red', 'MEDIUM': 'orange', 'LOW': 'green'}
bar_colors = [priority_colors.get(p, 'gray') for p in priorities]

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.bar(priorities, avg_latencies, color=bar_colors)
plt.ylabel('Average Latency (s)')
plt.title('Latency by Priority Level')
plt.grid(axis='y', alpha=0.3)

plt.subplot(1, 2, 2)
plt.bar(priorities, [s * 100 for s in sla_compliance], color=bar_colors)
plt.ylabel('SLA Compliance (%)')
plt.title('SLA Compliance by Priority')
plt.ylim(0, 100)
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary
for priority_name, data in metrics.items():
    compliance = data['sla_compliance']
    # Formatting None with ':.1%' raises TypeError, so guard it here as the
    # plotting code above does.
    compliance_text = f"{compliance:.1%}" if compliance is not None else "n/a"
    print(f"{priority_name}: {data['avg_latency']:.2f}s avg, {compliance_text} SLA compliance")

## Conclusion

Priority-based scheduling provides several key benefits:

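1. **SLA Guarantees**: High-priority requests are admitted to the batch first and therefore get faster service. Note that the scheduler shown here is non-preemptive, so this is best-effort prioritization rather than a hard guarantee.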
2. **Fair Resource Allocation**: Different user tiers get appropriate service levels
3. **Business Value**: Critical requests are processed first
4. **Flexibility**: Can adapt to different business requirements

This makes priority scheduling essential for production LLM systems serving multiple user classes with different service level requirements.