# 04 — Alertas y Monitoreo de Calidad

**Objetivo**: Implementar quality signals, ventana deslizante, umbrales de alerta, y tests canary.

## Contenido
1. Quality signals (judge score, format compliance, hallucination)
2. Ventana deslizante de métricas
3. Umbrales de alerta
4. Tests canary

In [None]:
import os
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file into the process environment
# (presumably so the OpenAI client can find its API key there -- confirm).
load_dotenv()

# Shared client and model name used by the cells below.
client = OpenAI()
MODEL = "gpt-5-mini"

# Notebook banner: rule, title, rule -- one item per line.
print("=" * 60, "ALERTAS Y MONITOREO DE CALIDAD", "=" * 60, sep="\n")

In [None]:
# ============================================================
# QUALITY SIGNALS
# ============================================================

class QualityMonitor:
    """Track quality signals over a sliding window and check alert thresholds.

    Three signals are kept per observation: a quality score, whether the
    response format was valid, and the request latency in milliseconds.
    The most recent ``window_size`` observations feed the alert checks and
    the summary; every observation is also appended to ``history``.

    Args:
        window_size: Maximum number of recent observations kept per signal.
        min_samples: Observations required before a signal is checked
            against its threshold. It is capped at ``window_size`` so that
            small windows can still raise alerts.
    """

    def __init__(self, window_size: int = 20, min_samples: int = 5):
        self.window_size = window_size
        # The window can never hold more than ``window_size`` items, so a
        # larger requirement would silently disable every alert. Keep at
        # least 1 so an empty window is never evaluated.
        self.min_samples = max(1, min(min_samples, window_size))
        self.scores: deque[float] = deque(maxlen=window_size)
        self.format_ok: deque[bool] = deque(maxlen=window_size)
        self.latencies: deque[float] = deque(maxlen=window_size)
        # Full, unbounded log of every observation (used for plotting).
        self.history: list[dict] = []

        # Alert thresholds
        self.thresholds = {
            "min_avg_score": 3.0,        # Minimum acceptable average score
            "max_format_fail_rate": 0.2,  # At most 20% format failures
            "max_p95_latency": 5000,      # At most 5 s (in ms) at p95
        }

    def record(self, score: float, format_valid: bool, latency_ms: float, label: str = "") -> dict:
        """Record one observation, print any triggered alerts, return the entry.

        Args:
            score: Quality score of the response.
            format_valid: Whether the response passed the format check.
            latency_ms: Request latency in milliseconds.
            label: Optional identifier stored with the entry.

        Returns:
            The dict appended to ``history`` (label, score, format_valid,
            latency_ms, timestamp).
        """
        self.scores.append(score)
        self.format_ok.append(format_valid)
        self.latencies.append(latency_ms)

        entry = {
            "label": label,
            "score": score,
            "format_valid": format_valid,
            "latency_ms": latency_ms,
            "timestamp": time.time(),
        }
        self.history.append(entry)

        # Re-evaluate the thresholds after every observation.
        for alert in self.check_alerts():
            print(f"  ALERTA: {alert}")

        return entry

    def check_alerts(self) -> list[str]:
        """Compare the current window against the thresholds.

        Returns:
            One message per violated threshold; an empty list when nothing
            is violated or fewer than ``min_samples`` observations exist.
        """
        alerts: list[str] = []

        if len(self.scores) >= self.min_samples:
            avg_score = float(np.mean(list(self.scores)))
            if avg_score < self.thresholds["min_avg_score"]:
                alerts.append(f"Score promedio bajo: {avg_score:.1f} < {self.thresholds['min_avg_score']}")

        if len(self.format_ok) >= self.min_samples:
            fail_rate = 1 - (sum(self.format_ok) / len(self.format_ok))
            if fail_rate > self.thresholds["max_format_fail_rate"]:
                alerts.append(f"Tasa de fallo de formato alta: {fail_rate:.0%} > {self.thresholds['max_format_fail_rate']:.0%}")

        if len(self.latencies) >= self.min_samples:
            p95 = float(np.percentile(list(self.latencies), 95))
            if p95 > self.thresholds["max_p95_latency"]:
                alerts.append(f"Latencia p95 alta: {p95:.0f}ms > {self.thresholds['max_p95_latency']}ms")

        return alerts

    def summary(self) -> dict:
        """Return aggregate metrics for the current window.

        Returns:
            ``{"status": "no_data"}`` when nothing has been recorded,
            otherwise a dict with the number of observations in the window,
            the average score, the format pass rate (percent) and the
            p50/p95 latencies. Values are plain Python numbers so the dict
            prints cleanly and can be serialized as JSON.
        """
        if not self.scores:
            return {"status": "no_data"}

        latencies = list(self.latencies)
        return {
            "window_size": len(self.scores),
            # float(...) turns NumPy scalars into built-in floats.
            "avg_score": round(float(np.mean(list(self.scores))), 2),
            "format_pass_rate": round(sum(self.format_ok) / len(self.format_ok) * 100, 1),
            "latency_p50": round(float(np.percentile(latencies, 50)), 0),
            "latency_p95": round(float(np.percentile(latencies, 95)), 0),
        }


# Shared monitor instance fed by the simulation and read by the dashboard.
monitor = QualityMonitor(window_size=20)
print("QualityMonitor inicializado", f"Umbrales: {monitor.thresholds}", sep="\n")

In [None]:
# ============================================================
# SIMULACION DE MONITOREO
# ============================================================

# Sample questions sent to the model, one request each, to feed the monitor.
preguntas_monitor = [
    "Que es Batman?",
    "Explica el sentido aracnido",
    "Que paso en Year One?",
    "Quien es Venom?",
    "Que es el Tribunal de los Buhos?",
    "Como murio Gwen Stacy?",
    "Que es la Batcueva?",
    "Explica la Saga del Clon",
    "Que filosofia tiene Batman?",
    "Describe los poderes de Spider-Man",
    "Que es Knightfall?",
    "Como funciona Civil War?",
    "Quien es Catwoman?",
    "Que es Spider-Verse?",
    "Describe la Liga de la Justicia",
]

print("=" * 60)
# Derive the count from the list so the banner stays correct if it changes.
print(f"SIMULACION DE MONITOREO ({len(preguntas_monitor)} requests)")
print("=" * 60)

for i, pregunta in enumerate(preguntas_monitor):
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "Responde en 2-3 oraciones sobre comics, en español."},
            {"role": "user", "content": pregunta},
        ],
        # `max_tokens` is deprecated in the Chat Completions API;
        # `max_completion_tokens` is its replacement.
        # NOTE(review): for reasoning models this budget presumably also
        # covers reasoning tokens, so short budgets may yield empty answers.
        max_completion_tokens=150,
    )
    latencia = (time.perf_counter() - t0) * 1000

    # `content` may be None (e.g. no text was produced); treat it as an
    # empty answer so it is scored as a failure instead of crashing.
    respuesta = response.choices[0].message.content or ""

    # Simplified quality evaluation: one point per 10 words, clamped to 1..5.
    score = min(5, max(1, len(respuesta.split()) // 10))
    format_ok = 20 < len(respuesta) < 2000

    monitor.record(score, format_ok, latencia, label=f"q{i+1}")

    # Print a rolling summary every 5 requests.
    if (i + 1) % 5 == 0:
        s = monitor.summary()
        print(f"\n  Checkpoint {i+1}: avg_score={s['avg_score']}, format_pass={s['format_pass_rate']}%, p50={s['latency_p50']}ms")

print(f"\nResumen final: {monitor.summary()}")

## 2. Tests Canary

Preguntas con respuesta conocida para detectar regresiones.

In [None]:
# ============================================================
# TESTS CANARY
# ============================================================

# Canary cases: questions with a known answer, used to detect regressions.
# Each entry has:
#   "pregunta":           the question sent to the model.
#   "keywords_esperadas": substrings searched in the lowercased answer;
#                         run_canary_tests passes the case if ANY one appears.
#   "tipo":               a category tag (not read by run_canary_tests).
canary_tests = [
    {
        "pregunta": "En que ciudad opera Batman?",
        "keywords_esperadas": ["gotham"],
        "tipo": "factual",
    },
    {
        "pregunta": "Cual es el nombre real de Spider-Man?",
        "keywords_esperadas": ["peter", "parker"],
        "tipo": "factual",
    },
    {
        "pregunta": "Que regla fundamental tiene Batman sobre matar?",
        "keywords_esperadas": ["no matar", "no mata"],
        "tipo": "conceptual",
    },
    {
        "pregunta": "Quien es el tio de Peter Parker?",
        "keywords_esperadas": ["ben"],
        "tipo": "factual",
    },
]


def run_canary_tests(tests: list[dict]) -> dict:
    """Run canary questions against the model and report pass/fail results.

    A test passes when at least one of its expected keywords appears as a
    substring of the lowercased model answer.

    Args:
        tests: Dicts with a "pregunta" string and a "keywords_esperadas"
            list of strings.

    Returns:
        A dict with "passed" and "failed" counts and a "details" list holding
        one dict per test: the question, "PASS"/"FAIL", and the first 100
        characters of the lowercased answer.
    """
    results = {"passed": 0, "failed": 0, "details": []}

    for test in tests:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "Responde en español, brevemente."},
                {"role": "user", "content": test["pregunta"]},
            ],
            # `max_tokens` is deprecated in the Chat Completions API;
            # `max_completion_tokens` is its replacement.
            max_completion_tokens=100,
        )

        # `content` may be None; treat it as an empty answer so the test is
        # reported as FAIL instead of raising AttributeError.
        respuesta = (response.choices[0].message.content or "").lower()
        # Lowercase the keywords too so matching is case-insensitive on both
        # sides, not only on the answer.
        keywords_found = any(kw.lower() in respuesta for kw in test["keywords_esperadas"])

        if keywords_found:
            results["passed"] += 1
            status = "PASS"
        else:
            results["failed"] += 1
            status = "FAIL"

        results["details"].append({
            "pregunta": test["pregunta"],
            "status": status,
            "respuesta": respuesta[:100],
        })

    return results


# Section banner: rule, title, rule -- one item per line.
print("=" * 60, "TESTS CANARY", "=" * 60, sep="\n")

canary_results = run_canary_tests(canary_tests)

# One two-line entry per test: status + question, then an answer preview.
for detail in canary_results["details"]:
    print(
        f"  [{detail['status']:4s}] {detail['pregunta']}\n"
        f"         {detail['respuesta'][:80]}..."
    )

print(f"\nResultado: {canary_results['passed']}/{len(canary_tests)} passed")

# Any failed canary is surfaced as an alert line.
if canary_results["failed"] > 0:
    print("  ALERTA: Tests canary fallidos detectados!")

## 3. Dashboard de Calidad

In [None]:
# ============================================================
# DASHBOARD DE CALIDAD
# ============================================================

# Three side-by-side panels: quality score, latency, and canary results.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: quality score per request, with the alert threshold and the mean.
scores = [h["score"] for h in monitor.history]
axes[0].plot(range(1, len(scores) + 1), scores, "o-", color="#2196F3", markersize=4)
axes[0].axhline(monitor.thresholds["min_avg_score"], color="red", linestyle="--", label=f"Umbral: {monitor.thresholds['min_avg_score']}")
# Only draw the mean when there is data; np.mean([]) yields NaN and a warning.
if scores:
    avg = np.mean(scores)
    axes[0].axhline(avg, color="green", linestyle=":", label=f"Media: {avg:.1f}")
axes[0].set_xlabel("Request #")
axes[0].set_ylabel("Quality Score")
axes[0].set_title("Quality Score (ventana deslizante)")
axes[0].set_ylim(0, 5.5)
axes[0].legend(fontsize=8)

# Panel 2: latency per request, with the p95 alert threshold.
lats = [h["latency_ms"] for h in monitor.history]
axes[1].fill_between(range(1, len(lats) + 1), lats, alpha=0.3, color="#FF9800")
axes[1].plot(range(1, len(lats) + 1), lats, "-", color="#FF9800")
axes[1].axhline(monitor.thresholds["max_p95_latency"], color="red", linestyle="--", label=f"Umbral p95: {monitor.thresholds['max_p95_latency']}ms")
axes[1].set_xlabel("Request #")
axes[1].set_ylabel("Latencia (ms)")
axes[1].set_title("Latencia por Request")
axes[1].legend(fontsize=8)

# Panel 3: one full-width bar per canary test, green for PASS, red otherwise.
labels = [d["pregunta"][:20] + "..." for d in canary_results["details"]]
colors = ["#4CAF50" if d["status"] == "PASS" else "#FF5722" for d in canary_results["details"]]
axes[2].barh(labels, [1] * len(labels), color=colors, alpha=0.8)
axes[2].set_xlabel("Status")
axes[2].set_title("Tests Canary")
axes[2].set_xlim(0, 1.2)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# so the figure is not lost to a FileNotFoundError.
output_path = "../data/dashboard_calidad.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.show()

## Takeaways

1. **Quality signals** cuantifican la salud del sistema en tiempo real
2. **Ventana deslizante** suaviza el ruido y detecta tendencias
3. **Umbrales de alerta** automatizan la detección de degradación
4. **Tests canary** son preguntas con respuesta conocida que detectan regresiones
5. Ejecutar canary tests **periódicamente** (cada N requests o cada hora)
6. **Dashboard** combina todas las métricas para toma de decisiones
7. En producción, conectar alertas a Slack/PagerDuty para respuesta inmediata