# Lab-2.3 Part 2: Async Processing

## Objectives
- Master async/await patterns
- Implement concurrent request handling
- Build streaming responses (SSE)
- Create WebSocket endpoints

## Estimated Time: 60-90 minutes

---
## 1. Async Fundamentals

In [None]:
# Understanding async/await
import asyncio
import time

# Synchronous version
def sync_task(n, delay=1):
    """Run a blocking simulated task and return its identifier.

    Args:
        n: Identifier echoed in the progress messages and returned.
        delay: Seconds to block in ``time.sleep``. Defaults to 1, the
            value the lab's timing comparison relies on.

    Returns:
        ``n`` unchanged.
    """
    print(f"Task {n} started")
    time.sleep(delay)  # Simulate work (blocks the calling thread)
    print(f"Task {n} completed")
    return n

# Async version
async def async_task(n, delay=1):
    """Run a non-blocking simulated task and return its identifier.

    Args:
        n: Identifier echoed in the progress messages and returned.
        delay: Seconds to wait in ``asyncio.sleep``. Defaults to 1, the
            value the lab's timing comparison relies on.

    Returns:
        ``n`` unchanged.
    """
    print(f"Task {n} started")
    # Awaiting the sleep yields to the event loop so other tasks can run.
    await asyncio.sleep(delay)
    print(f"Task {n} completed")
    return n

# Compare execution time
print("Synchronous execution:")
start = time.time()
for i in range(3):
    sync_task(i)
sync_time = time.time() - start
print(f"Time: {sync_time:.2f}s\n")

print("Async execution:")
start = time.time()
await asyncio.gather(*[async_task(i) for i in range(3)])
async_time = time.time() - start
print(f"Time: {async_time:.2f}s")

print(f"\nSpeedup: {sync_time/async_time:.1f}x ⚡")

---
## 2. Async FastAPI Endpoints

In [None]:
%%writefile app_async.py
import asyncio
import time
import uuid
from typing import Dict, Optional

from fastapi import BackgroundTasks, FastAPI, HTTPException
from pydantic import BaseModel

# ASGI application object that the route decorators below register on.
app = FastAPI()

# Job storage
# In-memory mapping of job_id -> job record with the keys "status", "prompt"
# and "result". Nothing in this module removes entries.
jobs: Dict[str, dict] = {}

# Request body accepted by POST /jobs/generate.
class JobRequest(BaseModel):
    prompt: str
    max_tokens: int = 100

# Body returned when a job is created: its id and initial status.
class JobResponse(BaseModel):
    job_id: str
    status: str

# Status/result payload returned by GET /jobs/{job_id}.
class JobResult(BaseModel):
    job_id: str
    status: str
    # None until the job has produced output. Declared Optional so that an
    # explicit result=None validates; a bare `str` annotation rejects None
    # under pydantic v2.
    result: Optional[str] = None

async def process_generation(job_id: str, prompt: str, max_tokens: int):
    """Run the simulated generation for one job and record the outcome.

    Marks the job record in ``jobs`` as "processing", waits two seconds in
    place of a real model call, then stores the final status and the
    generated text. ``max_tokens`` is accepted but not used by the simulation.
    """
    jobs[job_id].update(status="processing")

    # Stand-in for the actual model call
    await asyncio.sleep(2)

    jobs[job_id].update(
        status="completed",
        result=f"Generated text for: {prompt}",
    )

@app.post("/jobs/generate", response_model=JobResponse)
async def create_job(request: JobRequest, background_tasks: BackgroundTasks):
    """Create async generation job."""
    # Each job gets a random UUID so clients can poll for it later.
    new_id = str(uuid.uuid4())
    initial_status = "queued"

    # Record the job before scheduling so a status lookup finds it immediately.
    jobs[new_id] = dict(
        status=initial_status,
        prompt=request.prompt,
        result=None,
    )

    # Hand the generation to the background task list; the handler returns
    # without awaiting it.
    background_tasks.add_task(
        process_generation, new_id, request.prompt, request.max_tokens
    )

    return JobResponse(job_id=new_id, status=initial_status)

@app.get("/jobs/{job_id}", response_model=JobResult)
async def get_job(job_id: str):
    """Get job status and result."""
    # Single lookup; an unknown id yields None instead of raising KeyError.
    job = jobs.get(job_id)
    if job is None:
        # HTTPException comes from the fastapi import at the top of this
        # module; without it this branch raises NameError instead of a 404.
        raise HTTPException(status_code=404, detail="Job not found")

    return JobResult(
        job_id=job_id,
        status=job["status"],
        # Stays None until the background task stores the generated text.
        result=job.get("result"),
    )

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # host 0.0.0.0 listens on every network interface; this app uses port 8002.
    uvicorn.run(app, host="0.0.0.0", port=8002)

---
## 3. Streaming Responses (SSE)

In [None]:
%%writefile app_streaming.py
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import asyncio
import json

# ASGI application object for the streaming demo.
app = FastAPI()

# Request body accepted by POST /v1/completions/stream.
class StreamRequest(BaseModel):
    prompt: str
    max_tokens: int = 50

async def generate_stream(prompt: str, max_tokens: int, delay: float = 0.1):
    """Simulate streaming generation.

    Yields a fixed list of tokens one at a time as Server-Sent Events,
    followed by a terminating ``[DONE]`` event.

    Args:
        prompt: Input text. Accepted but not used by the simulation.
        max_tokens: Maximum number of tokens to emit. Values of zero or
            less emit no tokens, only the ``[DONE]`` event.
        delay: Seconds to wait before each token. Defaults to 0.1.

    Yields:
        One SSE event string per token, a ``data:`` line carrying a JSON
        object with "token", "index" and "done", then the ``[DONE]`` event.
    """
    words = [
        "The", "future", "of", "artificial", "intelligence",
        "is", "very", "promising", "and", "exciting"
    ]

    # Truncate once, clamping negatives (a negative slice bound would count
    # from the end). "done" is then judged against what is actually sent, so
    # the last emitted token is flagged even when max_tokens cuts the list.
    emitted = words[:max(max_tokens, 0)]
    last_index = len(emitted) - 1

    for i, word in enumerate(emitted):
        await asyncio.sleep(delay)  # Simulate generation delay

        data = {
            "token": word,
            "index": i,
            "done": i == last_index
        }

        # SSE format: a "data:" line terminated by a blank line
        yield f"data: {json.dumps(data)}\n\n"

    # End signal
    yield "data: [DONE]\n\n"

@app.post("/v1/completions/stream")
async def stream_generate(request: StreamRequest):
    """Stream generation with Server-Sent Events."""
    # Headers sent with the event stream: no caching, keep the connection open.
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }
    # The async generator is handed to the response un-iterated.
    token_events = generate_stream(request.prompt, request.max_tokens)
    return StreamingResponse(
        token_events,
        media_type="text/event-stream",
        headers=sse_headers,
    )

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # host 0.0.0.0 listens on every network interface; this app uses port 8003.
    uvicorn.run(app, host="0.0.0.0", port=8003)

### Test Streaming

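**Note**: A streaming response must be read incrementally on the client side. With `requests`, pass `stream=True` and iterate over `response.iter_lines()` instead of reading the whole body at once.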

In [None]:
# Test streaming endpoint
import requests
import json

def test_streaming(prompt: str):
    """Test SSE streaming.

    Posts ``prompt`` to the local streaming endpoint and prints each token as
    it arrives, stopping at the ``[DONE]`` sentinel. Connection problems,
    timeouts and HTTP error statuses are reported with a message instead of
    raising.

    Args:
        prompt: Text sent as the "prompt" field of the request body.
    """
    url = "http://localhost:8003/v1/completions/stream"
    payload = {
        "prompt": prompt,
        "max_tokens": 10
    }
    prefix = "data: "

    try:
        # The context manager releases the streamed connection even when the
        # loop exits early; the timeout keeps a stalled server from hanging
        # the notebook indefinitely.
        with requests.post(url, json=payload, stream=True, timeout=30) as response:
            if not response.ok:
                print(f"❌ Server returned HTTP {response.status_code}")
                return

            print(f"Streaming response for: '{prompt}'\n")

            for raw_line in response.iter_lines():
                if not raw_line:
                    continue  # blank separator between SSE events
                line = raw_line.decode('utf-8')
                if not line.startswith(prefix):
                    continue
                event = line[len(prefix):]  # Remove 'data: ' prefix
                if event == '[DONE]':
                    print("\n\n✅ Stream complete")
                    break
                token_data = json.loads(event)
                print(token_data['token'], end=' ', flush=True)

    except requests.exceptions.ConnectionError:
        print("❌ Server not running on port 8003")
    except requests.exceptions.Timeout:
        print("❌ Request to port 8003 timed out")

# Test
# test_streaming("The future of AI")

---
## 4. WebSocket Implementation

In [None]:
%%writefile app_websocket.py
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
import asyncio
import json

# ASGI application object for the WebSocket demo.
app = FastAPI()

@app.websocket("/ws/generate")
async def websocket_generate(websocket: WebSocket):
    """WebSocket endpoint for real-time generation."""
    # Protocol: the client sends one JSON object with optional "prompt" and
    # "max_tokens" keys; the server answers with one {"type": "token"} message
    # per word and a final {"type": "done"} message, then closes the socket.
    await websocket.accept()

    try:
        # Receive request
        data = await websocket.receive_json()
        prompt = data.get("prompt", "")
        max_tokens = data.get("max_tokens", 50)

        # Simulate streaming generation
        words = prompt.split() + [
            "is", "a", "fascinating", "topic", "that", "requires", "further", "study"
        ]

        for i, word in enumerate(words[:max_tokens]):
            await asyncio.sleep(0.1)

            await websocket.send_json({
                "type": "token",
                "data": word,
                "index": i
            })

        # Send completion
        await websocket.send_json({"type": "done"})

    except WebSocketDisconnect:
        # The peer has already gone away. Sending a close frame on a
        # disconnected socket can raise RuntimeError (depending on the
        # Starlette/server version), so stop without calling close().
        print("Client disconnected")
        return

    # Reached only while the connection is still open.
    await websocket.close()

@app.get("/")
async def root():
    # Plain HTTP route that points callers at the WebSocket path.
    info = {"message": "WebSocket server running on /ws/generate"}
    return info

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # host 0.0.0.0 listens on every network interface; this app uses port 8004.
    uvicorn.run(app, host="0.0.0.0", port=8004)

### WebSocket Client Test

In [None]:
# WebSocket client test (conceptual)
# Requires: pip install websockets
# The client is kept in a string and only printed by this cell, so the cell
# itself never imports `websockets` or opens a connection.

example_code = '''
import asyncio
import websockets
import json

async def test_websocket():
    uri = "ws://localhost:8004/ws/generate"
    
    async with websockets.connect(uri) as websocket:
        # Send request
        await websocket.send(json.dumps({
            "prompt": "Machine learning",
            "max_tokens": 10
        }))
        
        # Receive tokens
        while True:
            message = await websocket.recv()
            data = json.loads(message)
            
            if data["type"] == "done":
                break
            
            print(data["data"], end=" ", flush=True)

# Run: asyncio.run(test_websocket())
'''

# Show the example framed by two 60-character separator lines.
print("WebSocket Client Example:")
print("=" * 60)
print(example_code)
print("=" * 60)

---
## 5. Concurrent Request Handling

In [None]:
%%writefile app_concurrent.py
from fastapi import FastAPI
from pydantic import BaseModel
import asyncio
from collections import deque
import time

# ASGI application object for the concurrency demo.
app = FastAPI()

# Request body accepted by POST /generate.
class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 50

# Request queue
# NOTE(review): nothing in this module appends to request_queue, so the
# "queue_size" values reported by the endpoints are always 0.
request_queue = deque(maxlen=100)
# Number of generate_text() calls currently past the admission check.
active_requests = 0
# Upper bound on simultaneously running generations.
MAX_CONCURRENT = 10

async def generate_text(prompt: str, max_tokens: int) -> str:
    """Produce simulated LLM output while honoring the concurrency cap.

    Polls every 0.1 s until fewer than ``MAX_CONCURRENT`` generations are
    running, counts itself in ``active_requests`` for the duration of a
    one-second simulated generation, and always decrements the counter on
    the way out. ``max_tokens`` is accepted but not used by the simulation.
    """
    global active_requests

    # Admission check: back off while every slot is taken.
    while not active_requests < MAX_CONCURRENT:
        await asyncio.sleep(0.1)

    active_requests += 1
    try:
        # Stand-in for the real model call
        await asyncio.sleep(1)
        generated = f"Generated: {prompt} (simulated)"
    finally:
        # Release the slot even if the generation is cancelled or fails.
        active_requests -= 1
    return generated

@app.post("/generate")
async def generate(request: GenerateRequest):
    """Handle generation with concurrency control."""
    text = await generate_text(request.prompt, request.max_tokens)

    # The counters are sampled after this request's generation has finished.
    response_body = dict(
        text=text,
        queue_size=len(request_queue),
        active_requests=active_requests,
    )
    return response_body

@app.get("/stats")
async def stats():
    # Snapshot of the module-level concurrency counters at call time.
    snapshot = dict(
        active_requests=active_requests,
        queue_size=len(request_queue),
        max_concurrent=MAX_CONCURRENT,
    )
    return snapshot

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # host 0.0.0.0 listens on every network interface; this app uses port 8005.
    uvicorn.run(app, host="0.0.0.0", port=8005)

### Load Test Concurrent Server

In [None]:
# Concurrent load test
import asyncio
import aiohttp
import time

async def send_request(session, url, prompt, request_id):
    """Send async request.

    Posts ``{"prompt": prompt}`` to ``url`` and times the full round trip,
    including reading and parsing the JSON response body.

    Args:
        session: Client session used for the POST (an ``aiohttp.ClientSession``
            in this notebook).
        url: Endpoint to post to.
        prompt: Text sent as the "prompt" field.
        request_id: Caller-chosen identifier echoed back in the result.

    Returns:
        A dict with "id", "time" (elapsed seconds) and "success". Completed
        responses also carry "status" (the HTTP status code); failures caused
        by an exception carry "error" (the exception text).
    """
    start = time.perf_counter()  # monotonic clock for interval timing

    try:
        async with session.post(url, json={"prompt": prompt}) as response:
            # Consume the body so the timing covers the whole response; the
            # parsed value itself is not needed.
            await response.json()
            return {
                "id": request_id,
                "time": time.perf_counter() - start,
                # A 4xx/5xx answer is a failed request even though the
                # round trip itself completed.
                "success": response.status < 400,
                "status": response.status
            }
    except Exception as e:
        # Deliberately broad: a load test records every failure instead of
        # aborting the whole run.
        return {
            "id": request_id,
            "time": time.perf_counter() - start,
            "success": False,
            "error": str(e)
        }

async def load_test(num_requests=20):
    """Run concurrent load test.

    Fires ``num_requests`` POSTs at the local /generate endpoint at once and
    prints throughput and latency statistics for the successful ones.

    Args:
        num_requests: Number of requests to send concurrently.
    """
    url = "http://localhost:8005/generate"

    async with aiohttp.ClientSession() as session:
        tasks = [
            send_request(session, url, f"Test prompt {i}", i)
            for i in range(num_requests)
        ]

        start = time.perf_counter()  # monotonic clock for interval timing
        results = await asyncio.gather(*tasks)
        total_time = time.perf_counter() - start

    # Analyze results
    latencies = [r["time"] for r in results if r["success"]]
    successful = len(latencies)

    print(f"\nLoad Test Results ({num_requests} concurrent requests):")
    print("=" * 60)
    print(f"Total time:     {total_time:.2f}s")
    print(f"Successful:     {successful}/{num_requests}")
    print(f"Requests/sec:   {num_requests/total_time:.2f}")
    if latencies:
        # Built-ins are enough here and avoid depending on numpy, which this
        # cell never imports.
        print(f"Avg latency:    {sum(latencies) / len(latencies):.3f}s")
        print(f"Min latency:    {min(latencies):.3f}s")
        print(f"Max latency:    {max(latencies):.3f}s")
    else:
        # No successful request (e.g. the server is not running): there is
        # nothing to aggregate.
        print("Latency stats:  n/a (no successful requests)")
    print("=" * 60)

# Run test (if server is running)
# await load_test(20)

---
## Summary

✅ **Completed**:
1. Mastered async/await patterns
2. Implemented background tasks
3. Built streaming responses (SSE)
4. Created WebSocket endpoints
5. Handled concurrent requests

📚 **Key Patterns**:
- `async def` for async functions
- `await` for async operations
- `asyncio.gather()` for concurrency
- `StreamingResponse` for SSE
- `WebSocket` for bidirectional communication

💡 **Best Practices**:
- Use async for I/O-bound operations
- Limit concurrent requests to avoid overload
- Implement proper error handling
- Use streaming for better UX

➡️ **Next**: In `03-Integration_with_vLLM.ipynb`, we'll:
- Integrate AsyncLLMEngine
- Build production-ready endpoints
- Optimize concurrent performance

In [None]:
# Final cell: confirms the notebook ran to the end.
print("✅ Lab 2.3 Part 2 Complete!")