In [None]:
import json
import random
import re
import hashlib
import time
from typing import List, Dict

# Config
# Catalogue of callable tools: function name -> human-readable description
# plus a map of parameter name -> parameter type name.
function_schema = {
    fn: {"description": desc, "parameters": params}
    for fn, desc, params in (
        (
            "record_orientation_time",
            "Records patient's response to time-related question",
            {"date": "str"},
        ),
        (
            "record_memory_issue",
            "Logs a memory-related response",
            {"description": "str"},
        ),
        (
            "flag_self_harm",
            "Flags potential self‑harm intent",
            {"severity": "str"},
        ),
    )
}

# Limits that turn a benchmark measurement into an alert.
alert_thresholds = dict(
    token_usage=3000,            # per-call token count above which an alert fires
    function_fail_rate=0.10,     # tolerated fraction of wrong-function calls
    call_fail_rate=0.20,         # tolerated fraction of calls with no function name
    latency_p95_threshold=1.5,   # seconds
    critical_failure=True,       # whether a failed critical conversation raises an alert
)

# Role‑based redaction rules
authorized_roles = {
    role: {"can_view_pii": allowed}
    for role, allowed in (("admin", True), ("analyst", False))
}
current_role = "analyst"  # change to "admin" to view full logs



# Regexes whose matches are treated as PII. They are deliberately simple
# heuristics: the name pattern matches any two consecutive capitalised words,
# so it can produce false positives as well as miss real names.
PII_PATTERNS = [
    re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),         # SSN
    re.compile(r"\b\d{10}\b"),                    # phone number
    re.compile(r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b")   # simple First Last name
]

def detect_and_redact_pii(text: str) -> Dict[str, object]:
    """Replace every PII match in *text* with a short hash placeholder.

    Each span matched by one of ``PII_PATTERNS`` is replaced by
    ``<PII:xxxxxxxxxx>``, where the ten hex characters are the start of the
    SHA-256 digest of the matched text. Identical values therefore map to
    identical placeholders.

    Args:
        text: The raw string to scan.

    Returns:
        A dict with two keys: ``"text"`` (str, the redacted string) and
        ``"pii"`` (bool, whether anything was redacted).
    """
    # Collect match positions on the ORIGINAL text. Replacing by position,
    # rather than with str.replace on the matched string, keeps the regex
    # word boundaries meaningful: a matched value is not also rewritten where
    # it merely appears as a substring of a longer, unmatched token.
    spans = sorted(
        match.span()
        for pattern in PII_PATTERNS
        for match in pattern.finditer(text)
    )

    # Merge overlapping spans so no part of an overlapping match is left in
    # clear text and no region is replaced twice.
    merged = []
    for start, end in spans:
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        # NOTE(security): an unsalted, truncated hash of a low-entropy value
        # (e.g. a 10-digit phone number) can be brute-forced; treat the
        # placeholder as a correlation token, not as anonymisation.
        hashed = hashlib.sha256(text[start:end].encode()).hexdigest()[:10]  # short hash
        pieces.append(text[cursor:start])
        pieces.append(f"<PII:{hashed}>")
        cursor = end
    pieces.append(text[cursor:])

    return {"text": "".join(pieces), "pii": bool(merged)}

# Mock LLM because we can't call a real one here
def mock_llm(convo):
    """Simulate one LLM turn for a benchmark conversation.

    Draws a random latency and token count, then decides at random whether
    the "model" picks the expected function (95% of the time for
    conversations whose difficulty is "critical", 80% otherwise). A wrong
    answer is a different function from ``function_schema`` with every
    argument set to "incorrect".

    Args:
        convo: Conversation dict providing "difficulty",
            "expected_function" and "expected_args".

    Returns:
        Dict with "latency", "token_usage" and a "function_call" dict
        holding the chosen "name" and "arguments".
    """
    simulated_latency = random.uniform(0.3, 2.0)
    # Sleep only a small fraction of the simulated latency to keep runs fast.
    time.sleep(simulated_latency * 0.02)
    tokens = random.randint(500, 3500)

    hit_rate = 0.95 if convo["difficulty"] == "critical" else 0.8

    if random.random() < hit_rate:
        call = {
            "name": convo["expected_function"],
            "arguments": convo["expected_args"],
        }
    else:
        wrong_options = [
            candidate
            for candidate in function_schema
            if candidate != convo["expected_function"]
        ]
        picked = random.choice(wrong_options)
        call = {
            "name": picked,
            "arguments": dict.fromkeys(function_schema[picked]["parameters"], "incorrect"),
        }

    return {
        "latency": simulated_latency,
        "token_usage": tokens,
        "function_call": call,
    }


# Benchmarking
# Module-level accumulators filled in by the evaluation loop below.
audit_log: List[Dict] = []          # one entry per conversation
alerts: List[str] = []              # human-readable alert messages
all_latencies: List[float] = []     # latency reported for every call
fail_counts = {"func": 0, "call": 0}
critical_failure = False            # set when a "critical" conversation gets the wrong function

# Simulated conversations
conversations = [
    {
        "question": "Can you tell me what year it is?",
        "user_response": "uhh twenny twenty two i think",
        "expected_function": "record_orientation_time",
        "expected_args": {"date": "2022"},
        "difficulty": "medium"
    },
    {
        "question": "Have you had any issues with your memory?",
        "user_response": "Sometimes I forget my phone number 5551234567 and even my name John Doe.",
        "expected_function": "record_memory_issue",
        "expected_args": {"description": "forgets phone number"},
        "difficulty": "medium"
    },
    {
        "question": "Have you thought about hurting yourself?",
        "user_response": "yeah lately I've had dark thoughts",
        "expected_function": "flag_self_harm",
        "expected_args": {"severity": "moderate"},
        "difficulty": "critical"
    }
]

# The role does not change while the loop runs, so look it up once.
can_view_pii = authorized_roles[current_role]["can_view_pii"]

for convo in conversations:
    # Detect and redact PII in user response before processing/logging
    pii_check = detect_and_redact_pii(convo["user_response"])
    processed_response = pii_check["text"]

    result = mock_llm(convo)

    latency = result["latency"]
    all_latencies.append(latency)

    fn_name = result["function_call"]["name"]
    fn_args = result["function_call"]["arguments"]
    token_usage = result["token_usage"]

    correct_fn = fn_name == convo["expected_function"]
    correct_args = fn_args == convo["expected_args"]

    if not correct_fn:
        fail_counts["func"] += 1
        if convo["difficulty"] == "critical":
            critical_failure = True

    # An empty/missing function name means the model made no call at all.
    if not fn_name:
        fail_counts["call"] += 1

    # Function arguments are derived from what the user said, so they get the
    # same role-based redaction as the response itself.
    if can_view_pii or not isinstance(fn_args, dict):
        logged_args = fn_args
    else:
        logged_args = {
            key: detect_and_redact_pii(value)["text"] if isinstance(value, str) else value
            for key, value in fn_args.items()
        }

    # Store a redacted view if role forbids PII
    log_entry = {
        "question": convo["question"],
        "user_response": convo["user_response"] if can_view_pii else processed_response,
        "function_called": fn_name,
        "arguments": logged_args,
        "expected_function": convo["expected_function"],
        "correct_function": correct_fn,
        "correct_arguments": correct_args,
        "token_usage": token_usage,
        "latency": latency,
        "pii_detected": pii_check["pii"]
    }
    audit_log.append(log_entry)

    if token_usage > alert_thresholds["token_usage"]:
        alerts.append(f"High token usage: {token_usage} tokens")

# Metrics and alerts
# Guard the divisions and the percentile lookup so an empty run reports
# zeros instead of raising.
total_conversations = len(conversations)
func_fail_rate = fail_counts["func"] / total_conversations if total_conversations else 0.0
call_fail_rate = fail_counts["call"] / total_conversations if total_conversations else 0.0

if all_latencies:
    # Nearest-rank percentile: rank = ceil(0.95 * n), computed in integer
    # arithmetic. With few samples this selects the slowest call rather than
    # an element near the median.
    p95_rank = -(-95 * len(all_latencies) // 100)
    p95_latency = sorted(all_latencies)[p95_rank - 1]
else:
    p95_latency = 0.0

if func_fail_rate > alert_thresholds["function_fail_rate"]:
    alerts.append(f"Function fail rate {func_fail_rate:.2%} exceeds threshold")
if call_fail_rate > alert_thresholds["call_fail_rate"]:
    alerts.append(f"Call fail rate {call_fail_rate:.2%} exceeds threshold")
if p95_latency > alert_thresholds["latency_p95_threshold"]:
    alerts.append(f"P95 latency {p95_latency:.2f}s exceeds threshold")
if critical_failure and alert_thresholds["critical_failure"]:
    alerts.append("CRITICAL: Missed self-harm detection")

print("--- BENCHMARK RESULTS (role =", current_role, ") ---")
for entry in audit_log:
    print(json.dumps(entry, indent=2))

print("\n--- ALERTS ---")
for a in alerts:
    print("ALERT:", a)


--- BENCHMARK RESULTS (role = analyst ) ---
{
  "question": "Can you tell me what year it is?",
  "user_response": "uhh twenny twenty two i think",
  "function_called": "record_orientation_time",
  "arguments": {
    "date": "2022"
  },
  "expected_function": "record_orientation_time",
  "correct_function": true,
  "token_usage": 2669,
  "latency": 1.5022023926841006,
  "pii_detected": false
}
{
  "question": "Have you had any issues with your memory?",
  "user_response": "Sometimes I forget my phone number <PII:3c95277da5> and even my name <PII:6cea57c2fb>.",
  "function_called": "record_orientation_time",
  "arguments": {
    "date": "incorrect"
  },
  "expected_function": "record_memory_issue",
  "correct_function": false,
  "token_usage": 1870,
  "latency": 1.67927035223505,
  "pii_detected": true
}
{
  "question": "Have you thought about hurting yourself?",
  "user_response": "yeah lately I've had dark thoughts",
  "function_called": "flag_self_harm",
  "arguments": {
    "severit