In [None]:
import asyncio, random, time, math, statistics
from dataclasses import dataclass, field
from collections import deque

def now_ms():
    """Return the current ``time.perf_counter`` reading in milliseconds.

    The absolute value has no meaning on its own; subtract two readings to
    obtain an elapsed duration.
    """
    seconds = time.perf_counter()
    return seconds * 1000.0

def pctl(xs, p):
    """Return the ``p``-quantile of ``xs`` using linear interpolation.

    Args:
        xs: Collection of numeric samples; it is sorted into a copy.
        p: Quantile as a fraction (``0.5`` is the median).

    Returns:
        ``None`` when ``xs`` is empty, otherwise the value at fractional rank
        ``(len(xs) - 1) * p``, interpolated between the two nearest samples.
    """
    if not xs:
        return None

    ordered = sorted(xs)
    rank = (len(ordered) - 1) * p
    below, above = math.floor(rank), math.ceil(rank)

    # The rank landed exactly on a sample: nothing to interpolate.
    if below == above:
        return ordered[int(rank)]

    anchor = ordered[below]
    span = ordered[above] - anchor
    return anchor + span * (rank - below)

@dataclass
class Stats:
    """Tallies and raw latency samples gathered while a scenario runs.

    Every integer field is a plain counter that callers increment directly;
    ``latencies_ms`` holds latency samples expressed in milliseconds.
    """

    latencies_ms: list = field(default_factory=list)
    ok: int = 0
    fail: int = 0
    dropped: int = 0
    retries: int = 0
    timeouts: int = 0
    cb_open: int = 0
    dlq: int = 0

    def summary(self, name):
        """Build a flat report dict labelled ``name``.

        The report carries every counter as-is, followed by the p50/p95/p99
        and mean latency rounded to two decimals. The latency entries are
        ``None`` when no samples were recorded.
        """
        samples = self.latencies_ms
        report = {"name": name}

        # Counters are copied verbatim, in the order they are reported.
        for counter in ("ok", "fail", "dropped", "retries", "timeouts", "cb_open", "dlq"):
            report[counter] = getattr(self, counter)

        quantiles = (
            ("lat_p50_ms", 0.50),
            ("lat_p95_ms", 0.95),
            ("lat_p99_ms", 0.99),
        )
        for key, fraction in quantiles:
            report[key] = round(pctl(samples, fraction), 2) if samples else None

        if samples:
            report["lat_mean_ms"] = round(statistics.mean(samples), 2)
        else:
            report["lat_mean_ms"] = None
        return report

In [None]:
@dataclass
class FailureModel:
    """Randomised latency and failure profile for a simulated service.

    Below or at a load factor of 1.0 the service answers in
    ``base_latency_ms`` plus uniform jitter and fails with ``fail_prob``.
    Above 1.0, both latency and failure probability grow linearly with the
    excess load.
    """

    base_latency_ms: float = 8.0
    jitter_ms: float = 6.0
    fail_prob: float = 0.05
    overload_fail_prob: float = 0.40
    overload_latency_ms: float = 50.0

    def sample(self, load_factor: float):
        """Draw one ``(latency_ms, failed)`` outcome for the given load.

        Two random numbers are consumed, in this order: one for the latency
        jitter, then one for the failure decision.
        """
        latency_ms = self.base_latency_ms + random.random() * self.jitter_ms
        failure_p = self.fail_prob

        if load_factor > 1.0:
            excess = load_factor - 1.0
            latency_ms += excess * self.overload_latency_ms
            # The failure probability is capped so some calls still succeed.
            failure_p = min(0.95, failure_p + excess * self.overload_fail_prob)

        failed = random.random() < failure_p
        return latency_ms, failed

class CircuitBreaker:
    """Time-based circuit breaker driven by a rolling window of outcomes.

    The breaker trips once the window holds ``window`` outcomes of which at
    least ``fail_threshold`` are failures, and then rejects calls for
    ``open_ms`` milliseconds.

    Args:
        fail_threshold: Failures within a full window required to trip.
        window: Number of most recent outcomes that are remembered.
        open_ms: How long, in milliseconds, the breaker stays open per trip.
    """

    def __init__(self, fail_threshold=8, window=20, open_ms=500):
        self.fail_threshold = fail_threshold
        self.window = window
        self.open_ms = open_ms
        # Each entry is True for a failure, False for a success.
        self.events = deque(maxlen=window)
        # Timestamp (same clock as now_ms) until which calls are rejected.
        self.open_until_ms = 0.0

    def allow(self):
        """Return True when the open period, if any, has elapsed."""
        return now_ms() >= self.open_until_ms

    def record(self, ok: bool):
        """Record one call outcome and trip the breaker if warranted.

        Args:
            ok: True for a successful call, False for a failed one.
        """
        self.events.append(not ok)
        if len(self.events) >= self.window and sum(self.events) >= self.fail_threshold:
            self.open_until_ms = now_ms() + self.open_ms
            # Forget the outcomes that caused this trip. Otherwise the stale
            # failures can re-open the breaker on the first outcome recorded
            # after the cool-down (even a success), and late results from
            # in-flight calls keep pushing the open period forward.
            self.events.clear()

class Bulkhead:
    """Async context manager that caps how many callers run concurrently.

    Args:
        limit: Maximum number of callers allowed inside the ``async with``
            body at any one time.
    """

    def __init__(self, limit):
        self.sem = asyncio.Semaphore(limit)

    async def __aenter__(self):
        """Wait for a free slot; the context value is ``None``."""
        slots = self.sem
        await slots.acquire()
        return None

    async def __aexit__(self, exc_type, exc, tb):
        """Give the slot back; exceptions from the body are not suppressed."""
        slots = self.sem
        slots.release()

def exp_backoff(attempt, base_ms=20, cap_ms=400):
    """Return a randomised exponential backoff delay in milliseconds.

    The ceiling doubles with every attempt, starting at ``base_ms`` for
    attempt 1, and never exceeds ``cap_ms``. The delay is drawn uniformly
    from zero up to that ceiling.
    """
    doubled = base_ms * (2 ** (attempt - 1))
    ceiling = min(cap_ms, doubled)
    return random.random() * ceiling

In [None]:
class DownstreamService:
    """Simulated dependency whose behaviour degrades with in-flight calls.

    Args:
        fm: Failure model sampled once per call for latency and outcome.
        capacity_rps: Nominal capacity; one tenth of it is the in-flight
            count at which the load factor reaches 1.0.
    """

    def __init__(self, fm: FailureModel, capacity_rps=250):
        self.fm = fm
        self.capacity_rps = capacity_rps
        self._inflight = 0

    async def handle(self, payload: dict):
        """Serve one request, sleeping for the sampled latency.

        Returns:
            ``{"status": "ok"}`` on success.

        Raises:
            RuntimeError: When the failure model decides the call fails.
        """
        self._inflight += 1
        try:
            soft_limit = self.capacity_rps / 10
            pressure = self._inflight / soft_limit
            # The load factor is floored at 0.5 before sampling.
            latency_ms, failed = self.fm.sample(max(0.5, pressure))
            await asyncio.sleep(latency_ms / 1000.0)
            if not failed:
                return {"status": "ok"}
            raise RuntimeError("downstream_error")
        finally:
            # Always release the in-flight slot, including on cancellation.
            self._inflight -= 1

async def rpc_call(
    svc,
    req,
    stats,
    timeout_ms=120,
    max_retries=0,
    cb=None,
    bulkhead=None,
):
    """Call ``svc.handle(req)`` with a timeout, retries and optional guards.

    Args:
        svc: Object exposing an async ``handle(payload)`` method.
        req: Payload forwarded to ``svc.handle``.
        stats: ``Stats`` instance updated in place.
        timeout_ms: Per-attempt timeout in milliseconds.
        max_retries: Number of additional attempts after the first one.
        cb: Optional circuit breaker consulted before every attempt and
            informed of every attempt's outcome.
        bulkhead: Optional async context manager limiting concurrency. Time
            spent waiting to enter it is not covered by the timeout.

    Returns:
        True if an attempt succeeded. False if the breaker rejected the call
        or every attempt failed.

    Notes:
        ``stats.fail`` counts failed attempts (and breaker rejections), not
        failed requests. The recorded latency spans all attempts and backoff
        sleeps of the request.
    """
    t0 = now_ms()
    timeout_s = timeout_ms / 1000.0

    attempt = 0
    while True:
        # Consult the breaker before every attempt, not only the first one,
        # so retries stop hitting the downstream once the breaker has opened.
        if cb and not cb.allow():
            stats.cb_open += 1
            stats.fail += 1
            return False

        attempt += 1
        try:
            if bulkhead:
                async with bulkhead:
                    await asyncio.wait_for(svc.handle(req), timeout=timeout_s)
            else:
                await asyncio.wait_for(svc.handle(req), timeout=timeout_s)
        except asyncio.TimeoutError:
            stats.timeouts += 1
        except Exception:
            # Any other downstream error is deliberately treated as a plain
            # failed attempt; it is counted just below.
            pass
        else:
            # Success bookkeeping sits outside the try body so that an error
            # here is not mistaken for a downstream failure.
            stats.latencies_ms.append(now_ms() - t0)
            stats.ok += 1
            if cb:
                cb.record(True)
            return True

        stats.fail += 1
        if cb:
            cb.record(False)
        if attempt <= max_retries:
            stats.retries += 1
            await asyncio.sleep(exp_backoff(attempt) / 1000.0)
            continue
        return False

In [None]:
@dataclass
class Event:
    """A unit of work carried on the event bus."""

    # Identifier; the consumer forwards it downstream as the "id" payload field.
    id: int
    # Delivery attempts so far; the consumer increments it on every pickup.
    tries: int = 0

class EventBus:
    """Bounded in-memory queue of events.

    Args:
        max_queue: ``maxsize`` handed to the underlying ``asyncio.Queue``.
    """

    def __init__(self, max_queue=5000):
        self.q = asyncio.Queue(maxsize=max_queue)

    async def publish(self, e: Event):
        """Enqueue ``e`` without waiting.

        Returns:
            True when the event was queued, False when the queue was full.
        """
        queue = self.q
        # No await separates the check from the put, so the put cannot fail.
        if queue.full():
            return False
        queue.put_nowait(e)
        return True

async def event_consumer(
    bus,
    svc,
    stats,
    stop,
    max_retries=0,
    dlq=None,
    bulkhead=None,
    timeout_ms=200,
):
    """Drain ``bus`` and deliver each event to ``svc`` with retries and a DLQ.

    The loop runs until ``stop`` is set and the queue is empty.

    Args:
        bus: Event bus exposing ``q`` (an ``asyncio.Queue``) and ``publish``.
        svc: Object exposing an async ``handle(payload)`` method.
        stats: ``Stats`` instance updated in place.
        stop: ``asyncio.Event`` signalling that no more events will arrive.
        max_retries: Number of re-deliveries allowed per event.
        dlq: Optional list receiving events that are given up on.
        bulkhead: Optional async context manager limiting concurrency.
        timeout_ms: Per-delivery timeout in milliseconds.

    Notes:
        ``stats.fail`` counts failed deliveries. ``stats.retries`` counts
        re-deliveries that were actually re-queued. An event that cannot be
        re-queued because the bus is full is dead-lettered instead of being
        silently lost.
    """
    call_timeout_s = timeout_ms / 1000.0

    while not stop.is_set() or not bus.q.empty():
        try:
            # A short poll lets the loop condition be re-evaluated regularly.
            e = await asyncio.wait_for(bus.q.get(), timeout=0.2)
        except asyncio.TimeoutError:
            continue

        t0 = now_ms()
        e.tries += 1
        try:
            try:
                if bulkhead:
                    async with bulkhead:
                        await asyncio.wait_for(svc.handle({"id": e.id}), timeout=call_timeout_s)
                else:
                    await asyncio.wait_for(svc.handle({"id": e.id}), timeout=call_timeout_s)
            except asyncio.TimeoutError:
                # Count timeouts the same way rpc_call does.
                stats.timeouts += 1
                delivered = False
            except Exception:
                # Any other downstream error is a plain failed delivery.
                delivered = False
            else:
                delivered = True

            if delivered:
                stats.ok += 1
                stats.latencies_ms.append(now_ms() - t0)
            else:
                stats.fail += 1
                requeued = False
                if e.tries <= max_retries:
                    await asyncio.sleep(exp_backoff(e.tries) / 1000.0)
                    # publish() reports False when the queue is full.
                    requeued = await bus.publish(e)
                if requeued:
                    stats.retries += 1
                else:
                    # Retries exhausted, or the bus had no room for a retry.
                    stats.dlq += 1
                    if dlq is not None:
                        dlq.append(e)
        finally:
            # Called after any re-publish so q.join() never sees the queue as
            # finished while this event is still waiting for a retry.
            bus.q.task_done()

In [3]:
async def generate_requests(total=2000, burst=350, gap_ms=80):
    """Produce request ids ``0 .. total-1`` in bursts.

    Ids are appended ``burst`` at a time, with an ``asyncio.sleep`` of
    ``gap_ms`` milliseconds after every burst (including the last one). The
    complete list is returned once all ids have been generated.
    """
    ids = []
    next_id = 0
    while next_id < total:
        size = min(burst, total - next_id)
        chunk = range(next_id, next_id + size)
        ids.extend(chunk)
        next_id += len(chunk)
        await asyncio.sleep(gap_ms / 1000.0)
    return ids

async def main():
    """Run the RPC scenario, then the event-driven scenario, and print both.

    The RPC scenario fires every request concurrently through a shared
    circuit breaker and bulkhead. The event-driven scenario publishes the
    same ids to a bus drained by 16 consumers.
    """
    random.seed(7)
    fm = FailureModel()
    svc = DownstreamService(fm)
    ids = await generate_requests()

    rpc_stats = Stats()
    cb = CircuitBreaker()
    bulk = Bulkhead(40)

    await asyncio.gather(*[
        rpc_call(svc, {"id": i}, rpc_stats, max_retries=3, cb=cb, bulkhead=bulk)
        for i in ids
    ])

    bus = EventBus()
    ev_stats = Stats()
    stop = asyncio.Event()
    dlq = []

    consumers = [
        asyncio.create_task(event_consumer(bus, svc, ev_stats, stop, max_retries=3, dlq=dlq))
        for _ in range(16)
    ]

    for i in ids:
        # publish() returns False when the queue is full; record the drop
        # instead of ignoring it so the "dropped" metric is meaningful.
        if not await bus.publish(Event(i)):
            ev_stats.dropped += 1

    await bus.q.join()
    stop.set()
    for c in consumers:
        c.cancel()
    # Wait for the cancelled consumers to actually finish so no task is left
    # pending; return_exceptions keeps their CancelledError from propagating.
    await asyncio.gather(*consumers, return_exceptions=True)

    print(rpc_stats.summary("RPC"))
    print(ev_stats.summary("EventDriven"))
    print("DLQ size:", len(dlq))

# Top-level ``await`` needs an already-running event loop, which IPython/Jupyter
# cells provide; from a plain script this would be ``asyncio.run(main())``.
await main()

  def __init__(self, when, callback, args, loop, context=None):



=== Metrics Summary ===
name                                           | ok   | fail | dropped | retries | timeouts | cb_open | dlq | lat_p50_ms | lat_p95_ms | lat_p99_ms | throughput_rps | total_ms
-----------------------------------------------+------+------+---------+---------+----------+---------+-----+------------+------------+------------+----------------+---------
RPC naive (retries=3, no CB)                   | 57   | 9396 | 0       | 7053    | 9354     | 0       | 0   | 17.81      | 528.84     | 581.83     | 470.7          | 5098.5  
RPC safe (retries=1, CB+bulkhead)              | 201  | 2394 | 0       | 195     | 0        | 2074    | 0   | 352.01     | 787.99     | 807.33     | 2821.3         | 850.7   
Event naive (retries=3, small queue)           | 1400 | 44   | 1000    | 44      | 0        | 0       | 0   | 12.68      | 16.18      | 16.67      | 1086.8         | 1288.1  
Event safe (retries=1, bulkhead, larger queue) | 2400 | 72   | 0       | 72      | 0        | 0     