In [None]:
!pip -q install -U openai
!pip -q install -U "git+https://github.com/openai/swarm.git"

import os

def load_openai_key():
    """Return the OpenAI API key, preferring a Colab secret over a prompt.

    Lookup order:
      1. The ``OPENAI_API_KEY`` entry in Colab's ``userdata`` store.
      2. A hidden interactive prompt via ``getpass``.

    Surrounding whitespace is stripped from whichever source supplies the
    key, so a secret saved with a trailing newline still works and a
    whitespace-only secret is treated as missing.

    Returns:
        str: The non-empty API key.

    Raises:
        RuntimeError: If neither source yields a non-empty key.
    """
    try:
        from google.colab import userdata
        key = userdata.get("OPENAI_API_KEY")
    except Exception:
        # Deliberate best effort: the import fails outside Colab, and the
        # lookup itself may raise; either way fall through to the prompt.
        key = None
    # Apply the same normalisation the prompt path gets.
    key = key.strip() if isinstance(key, str) else None
    if not key:
        import getpass
        key = getpass.getpass("Enter OPENAI_API_KEY (hidden): ").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY not provided")
    return key

# Export the key through the process environment so code that looks up
# OPENAI_API_KEY in os.environ can find it (presumably the OpenAI client used
# by Swarm in the next cell; confirm against the SDK docs).
os.environ["OPENAI_API_KEY"] = load_openai_key()

In [None]:
import json
import re
from typing import List, Dict
from swarm import Swarm, Agent

# Shared Swarm client; run_pipeline() calls client.run() on it for both passes.
client = Swarm()

In [None]:
# In-memory knowledge base queried by search_kb(). Each entry is a dict with:
#   "id"    - identifier string for the document
#   "title" - short title (included in keyword matching)
#   "text"  - body text (included in keyword matching)
KB_DOCS = [
    {
        "id": "kb-incident-001",
        "title": "API Latency Incident Playbook",
        "text": "If p95 latency spikes, validate deploys, dependencies, and error rates. Rollback, cache, rate-limit, scale. Compare p50 vs p99 and inspect upstream timeouts."
    },
    {
        "id": "kb-risk-001",
        "title": "Risk Communication Guidelines",
        "text": "Updates must include impact, scope, mitigation, owner, and next update. Avoid blame and separate internal vs external messaging."
    },
    {
        "id": "kb-ops-001",
        "title": "On-call Handoff Template",
        "text": "Include summary, timeline, current status, mitigations, open questions, next actions, and owners."
    },
]

def _normalize(s: str) -> List[str]:
    return re.sub(r"[^a-z0-9\s]", " ", s.lower()).split()

def search_kb(query: str, top_k: int = 3) -> str:
    """Search the incident knowledge base by keyword overlap.

    This function is registered as an agent tool, so this docstring also
    serves as the tool description the model sees.

    Args:
        query: Free-text description of what to look up.
        top_k: Maximum number of documents to return (default 3, minimum 1).

    Returns:
        A JSON array (as text) of matching documents, best match first; each
        document has "id", "title" and "text". If no document shares a word
        with the query, the single top-ranked document is returned as a
        fallback. If the knowledge base is empty the result is "[]".
    """
    # Tool arguments originate from model output: accept "3", and keep 0 or
    # negative values from raising or producing a surprising slice.
    try:
        limit = max(1, int(top_k))
    except (TypeError, ValueError):
        limit = 3

    query_terms = set(_normalize(query))
    scored = []
    for d in KB_DOCS:
        doc_terms = set(_normalize(d["title"] + " " + d["text"]))
        scored.append((len(query_terms & doc_terms), d))

    if not scored:
        # Empty knowledge base: nothing to rank and no fallback document.
        return json.dumps([], indent=2)

    # list.sort is stable, so equally scored documents keep KB_DOCS order.
    scored.sort(key=lambda x: x[0], reverse=True)
    docs = [d for s, d in scored[:limit] if s > 0] or [scored[0][1]]
    return json.dumps(docs, indent=2)

In [None]:
def estimate_mitigation_impact(options_json: str) -> str:
    """Rank candidate mitigations by confidence minus a risk penalty.

    This function is registered as an agent tool, so this docstring also
    serves as the tool description the model sees.

    Args:
        options_json: JSON text encoding a list of option objects, e.g.
            [{"option": "rollback", "confidence": 0.8, "risk": "low"}].
            "confidence" is a number (default 0.5). "risk" is "low",
            "medium" or "high" (case-insensitive, default "medium").
            A single object, or an object holding the list under an
            "options" key, is also accepted.

    Returns:
        JSON text: the options sorted by descending score, each with
        "option", "confidence", "risk" and "score" (confidence minus
        penalty, rounded to 3 decimals); or {"error": "..."} when the
        input cannot be interpreted.
    """
    try:
        options = json.loads(options_json)
    except Exception as e:
        return json.dumps({"error": str(e)})

    # Models sometimes wrap the list or send one bare object; accept both.
    if isinstance(options, dict):
        options = options.get("options", [options])
    if not isinstance(options, list):
        return json.dumps(
            {"error": "options_json must decode to a list of option objects"}
        )

    penalties = {"low": 0.1, "medium": 0.25, "high": 0.45}
    ranking = []
    for o in options:
        if not isinstance(o, dict):
            # A bare value such as "rollback": treat it as the option name
            # and score it with the defaults.
            o = {"option": o}
        try:
            conf = float(o.get("confidence", 0.5))
        except (TypeError, ValueError):
            # Unusable confidence is treated like a missing one.
            conf = 0.5
        risk = o.get("risk", "medium")
        # Case/whitespace-insensitive lookup so "High" is not scored as medium;
        # the original value is still echoed back unchanged.
        penalty = penalties.get(str(risk).strip().lower(), 0.25)
        ranking.append({
            "option": o.get("option"),
            "confidence": conf,
            "risk": risk,
            "score": round(conf - penalty, 3)
        })
    ranking.sort(key=lambda x: x["score"], reverse=True)
    return json.dumps(ranking, indent=2)

In [None]:
def handoff_to_sre():
    """Hand the conversation to the SRE agent. Use for incident response."""
    # sre_agent is looked up at call time, so it may be defined in a later cell.
    return sre_agent

def handoff_to_comms():
    """Hand the conversation to the Comms agent. Use for customer or executive messaging."""
    # comms_agent is looked up at call time, so it may be defined in a later cell.
    return comms_agent

def handoff_to_handoff_writer():
    """Hand the conversation to the HandoffWriter agent. Use for on-call notes."""
    # handoff_writer_agent is looked up at call time, so it may be defined in a later cell.
    return handoff_writer_agent

def handoff_to_critic():
    """Hand the conversation to the Critic agent. Use for review or improvement."""
    # critic_agent is looked up at call time, so it may be defined in a later cell.
    return critic_agent

In [None]:
# Entry-point agent: run_pipeline() starts every request here. Besides the
# knowledge-base search it is given the four handoff_to_* functions, each of
# which returns another agent (presumably how Swarm transfers control to that
# agent; see the Swarm README on handoffs).
triage_agent = Agent(
    name="Triage",
    model="gpt-4o-mini",
    instructions="""
Decide which agent should handle the request.
Use SRE for incident response.
Use Comms for customer or executive messaging.
Use HandoffWriter for on-call notes.
Use Critic for review or improvement.
""",
    functions=[search_kb, handoff_to_sre, handoff_to_comms, handoff_to_handoff_writer, handoff_to_critic]
)

# Incident-response specialist. Tools: knowledge-base search and the
# mitigation scorer. It has no handoff functions of its own.
sre_agent = Agent(
    name="SRE",
    model="gpt-4o-mini",
    instructions="""
Produce a structured incident response with triage steps,
ranked mitigations, ranked hypotheses, and a 30-minute plan.
""",
    functions=[search_kb, estimate_mitigation_impact]
)

# Messaging specialist; its only tool is the knowledge-base search.
comms_agent = Agent(
    name="Comms",
    model="gpt-4o-mini",
    instructions="""
Produce an external customer update and an internal technical update.
""",
    functions=[search_kb]
)

# On-call handoff author; its only tool is the knowledge-base search.
handoff_writer_agent = Agent(
    name="HandoffWriter",
    model="gpt-4o-mini",
    instructions="""
Produce a clean on-call handoff document with standard headings.
""",
    functions=[search_kb]
)

# Reviewer with no tools. run_pipeline() invokes it directly as a second pass;
# triage can also reach it through handoff_to_critic().
critic_agent = Agent(
    name="Critic",
    model="gpt-4o-mini",
    instructions="""
Critique the previous answer, then produce a refined final version and a checklist.
"""
)

In [2]:
def run_pipeline(user_request: str):
    """Answer ``user_request`` with the agent team, then refine it with the critic.

    Two Swarm runs are chained:
      1. ``triage_agent`` (and whichever agent it hands off to) drafts a reply.
      2. ``critic_agent`` reviews that draft and produces the final text.

    Args:
        user_request: The user's request in plain text.

    Returns:
        The ``content`` of the last message produced by the critic run.
    """
    messages = [{"role": "user", "content": user_request}]
    r1 = client.run(agent=triage_agent, messages=messages, max_turns=8)
    # Swarm's Response.messages holds only the messages generated during the
    # run, not the input history. Re-attach the original request; otherwise
    # the critic reviews a draft without ever seeing what was asked.
    messages2 = messages + r1.messages + [
        {"role": "user", "content": "Review and improve the last answer"}
    ]
    r2 = client.run(agent=critic_agent, messages=messages2, max_turns=4)
    return r2.messages[-1]["content"]

# Example incident. It asks for both an action plan and a customer update, so
# it touches more than one specialist's remit.
request = """
Production p95 latency jumped from 250ms to 2.5s after a deploy.
Errors slightly increased, DB CPU stable, upstream timeouts rising.
Provide a 30-minute action plan and a customer update.
"""

# Run both passes and print the critic's final text.
# NOTE(review): the saved output below begins with "FINAL AGENT: Critic", which
# this cell does not print; the output appears stale. Restart and re-run.
print(run_pipeline(request))

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Enter OPENAI_API_KEY (hidden): ··········

FINAL AGENT: Critic
### 1) 5-Point Critique:

1. **Clarity and Brevity:** The updates should maintain a clear separation between internal and external communications. The current phrasing can be streamlined, especially the internal update which is verbose.

2. **Action Items Specificity:**  The actions proposed in the internal update are broad. Specific actions should be clearly defined, such as who exactly will execute them and deadlines for when they should be completed.

3. **Customer Context:** The external update lacks context for customers about why this issue may impact them or what they can expect while it’s resolved. Including a reassurance would improve customer confidence.

4. **Next Steps Measurement:** Mentioning specific metrics or expected outcomes for the next