# 03 — Presupuesto y Control de Costos

**Objetivo**: Implementar control de costos con límites por request y sesión, terminación graceful y estrategias de optimización.

## Contenido
1. Clase `TokenBudget`
2. Límites por request y sesión
3. Terminación graceful
4. Estrategias de optimización
5. Dashboard de costos

In [None]:
import os
import json
import time
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from openai import OpenAI

# Load environment configuration before the API client is created.
load_dotenv()

# Shared client and model name used by every cell below.
MODEL = "gpt-5-mini"
client = OpenAI()

# Notebook banner, printed as three lines by a single call.
print("=" * 60, "PRESUPUESTO Y CONTROL DE COSTOS", "=" * 60, sep="\n")

In [None]:
# ============================================================
# CLASE TokenBudget
# ============================================================

class TokenBudget:
    """Track token and cost usage against per-request and per-session limits.

    The budget is purely an accounting object: callers ask ``check_budget``
    before issuing a request and report the actual usage with ``record``.
    """

    # Price table keyed by model name. ``record`` divides by 1_000_000, so the
    # values are USD per one million tokens.
    PRICING = {
        "gpt-5-mini": {"input": 0.15, "output": 0.60},
        "gpt-5": {"input": 2.00, "output": 8.00},
    }

    def __init__(
        self,
        model: str = "gpt-5-mini",
        max_tokens_per_request: int = 4000,
        max_tokens_per_session: int = 50000,
        max_cost_per_session: float = 0.10,  # USD
    ):
        """Store the limits and start an empty session.

        Args:
            model: Key into ``PRICING`` used to price recorded usage.
            max_tokens_per_request: Upper bound for a single request estimate.
            max_tokens_per_session: Upper bound for tokens across the session.
            max_cost_per_session: Upper bound for the session cost, in USD.
        """
        self.model = model
        self.max_tokens_per_request = max_tokens_per_request
        self.max_tokens_per_session = max_tokens_per_session
        self.max_cost_per_session = max_cost_per_session

        # Running totals, updated by ``record``.
        self.session_tokens = 0
        self.session_cost = 0.0
        # One entry per recorded request, in call order.
        self.requests: list[dict] = []

    @staticmethod
    def _usage_pct(used: float, limit: float) -> float:
        """Return ``used`` as a percentage of ``limit``, rounded to 1 decimal.

        A non-positive limit means there is no budget at all, so it is
        reported as fully used instead of dividing by zero.
        """
        if limit <= 0:
            return 100.0
        return round(used / limit * 100, 1)

    def check_budget(self, estimated_tokens: int = 0) -> dict:
        """Report whether another request fits in the remaining budget.

        Args:
            estimated_tokens: Expected size of the next request. ``0`` (the
                default) skips the size checks and only verifies that some
                token and cost budget is left.

        Returns:
            A dict with ``can_proceed``, ``remaining_tokens``,
            ``remaining_cost_usd``, ``session_usage_pct`` and
            ``cost_usage_pct``.
        """
        remaining_tokens = self.max_tokens_per_session - self.session_tokens
        remaining_cost = self.max_cost_per_session - self.session_cost

        # An estimate must respect the per-request cap AND still fit in what
        # is left of the session; otherwise the session limit is overshot.
        estimate_fits = estimated_tokens == 0 or (
            estimated_tokens <= self.max_tokens_per_request
            and estimated_tokens <= remaining_tokens
        )
        can_proceed = remaining_tokens > 0 and remaining_cost > 0 and estimate_fits

        return {
            "can_proceed": can_proceed,
            "remaining_tokens": remaining_tokens,
            "remaining_cost_usd": round(remaining_cost, 6),
            "session_usage_pct": self._usage_pct(
                self.session_tokens, self.max_tokens_per_session
            ),
            "cost_usage_pct": self._usage_pct(
                self.session_cost, self.max_cost_per_session
            ),
        }

    def record(self, input_tokens: int, output_tokens: int, label: str = "") -> dict:
        """Add the usage of one request to the session totals.

        Args:
            input_tokens: Prompt tokens consumed by the request.
            output_tokens: Completion tokens produced by the request.
            label: Free-form tag stored with the entry.

        Returns:
            The entry appended to ``self.requests`` (per-request and
            cumulative tokens and cost).

        Raises:
            ValueError: If either token count is negative; a negative count
                would silently give budget back.
        """
        if input_tokens < 0 or output_tokens < 0:
            raise ValueError("token counts must be non-negative")

        # Models missing from the table are priced as "gpt-5-mini".
        prices = self.PRICING.get(self.model, self.PRICING["gpt-5-mini"])
        cost = (
            input_tokens * prices["input"] / 1_000_000
            + output_tokens * prices["output"] / 1_000_000
        )

        total_tokens = input_tokens + output_tokens
        self.session_tokens += total_tokens
        self.session_cost += cost

        entry = {
            "label": label,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "cost_usd": round(cost, 6),
            "cumulative_tokens": self.session_tokens,
            "cumulative_cost": round(self.session_cost, 6),
        }
        self.requests.append(entry)
        return entry

    def summary(self) -> str:
        """Return a one-line report of requests, tokens and cost versus limits."""
        budget_check = self.check_budget()
        return (
            f"Requests: {len(self.requests)} | "
            f"Tokens: {self.session_tokens:,}/{self.max_tokens_per_session:,} ({budget_check['session_usage_pct']:.1f}%) | "
            f"Costo: ${self.session_cost:.6f}/${self.max_cost_per_session} ({budget_check['cost_usage_pct']:.1f}%)"
        )


# Session-wide budget shared by the cells below.
budget = TokenBudget(
    max_cost_per_session=0.05,
    max_tokens_per_session=50000,
    max_tokens_per_request=4000,
    model=MODEL,
)

# Echo the configured limits, one per line.
print(
    "TokenBudget inicializado:",
    f"  Max tokens/request: {budget.max_tokens_per_request:,}",
    f"  Max tokens/sesion: {budget.max_tokens_per_session:,}",
    f"  Max costo/sesion: ${budget.max_cost_per_session}",
    sep="\n",
)

In [None]:
# ============================================================
# LLAMADAS CON PRESUPUESTO
# ============================================================

def llm_call_budgeted(
    prompt: str,
    budget: TokenBudget,
    label: str = "",
    max_output_tokens: int = 200,
) -> dict:
    """Call the LLM only if ``budget`` still allows it, then record the usage.

    Args:
        prompt: User message sent to the model.
        budget: Budget tracker consulted before the call and updated after it.
        label: Tag stored with the recorded usage entry.
        max_output_tokens: Cap on generated tokens for this call. It is also
            passed to ``budget.check_budget`` as a lower-bound estimate of the
            request size, so the size limits are actually exercised.

    Returns:
        ``{"status": "budget_exceeded", "content": <message>}`` when the
        budget check fails (no API call is made); otherwise
        ``{"status": "ok", "content": <answer>}`` merged with the entry
        returned by ``budget.record``.
    """
    # Graceful termination: refuse up front instead of calling the API.
    check = budget.check_budget(estimated_tokens=max_output_tokens)
    if not check["can_proceed"]:
        return {
            "status": "budget_exceeded",
            "content": f"Presupuesto agotado. Tokens restantes: {check['remaining_tokens']}, Costo restante: ${check['remaining_cost_usd']}",
        }

    response = client.chat.completions.create(
        # Call the same model the budget uses for pricing, so the recorded
        # cost matches what was actually invoked.
        model=budget.model,
        messages=[
            {"role": "system", "content": "Responde en español, brevemente."},
            {"role": "user", "content": prompt},
        ],
        # `max_tokens` is deprecated in the Chat Completions API in favor of
        # `max_completion_tokens`; reasoning models reject the old name.
        max_completion_tokens=max_output_tokens,
    )

    # Charge the real usage reported by the API against the budget.
    entry = budget.record(
        response.usage.prompt_tokens,
        response.usage.completion_tokens,
        label=label,
    )

    return {
        "status": "ok",
        "content": response.choices[0].message.content,
        **entry,
    }


# Demo: ten budgeted calls, stopping as soon as the budget refuses one.
print("=" * 60, "LLAMADAS CON PRESUPUESTO", "=" * 60, sep="\n")

preguntas = [
    "Que es Batman?",
    "Que es Spider-Man?",
    "Que es un agente de IA?",
    "Que es RAG?",
    "Que es LangGraph?",
    "Que es ChromaDB?",
    "Que es un embedding?",
    "Que es un transformer?",
    "Que es fine-tuning?",
    "Que es prompt engineering?",
]

# Number the questions from 1 so the label and the printed index agree.
for numero, pregunta in enumerate(preguntas, start=1):
    result = llm_call_budgeted(pregunta, budget, label=f"q{numero}")
    if result["status"] == "budget_exceeded":
        print(f"  [{numero:2d}] PRESUPUESTO AGOTADO")
        break
    print(
        f"  [{numero:2d}] {result['total_tokens']:4d} tokens | "
        f"${result['cost_usd']:.6f} | acum: ${result['cumulative_cost']:.6f}"
    )

print("\n" + budget.summary())

## 4. Estrategias de Optimización

In [None]:
# ============================================================
# ESTRATEGIAS DE OPTIMIZACION
# ============================================================

# 1. Simple cache (same question → same answer)
response_cache: dict[str, str] = {}

def llm_call_cached(prompt: str) -> dict:
    """Answer ``prompt``, reusing a stored answer when the exact text repeats.

    Args:
        prompt: User message; also the cache key (exact string match).

    Returns:
        On a hit: ``{"status": "cache_hit", "content", "tokens": 0,
        "cost": 0}`` and no API call is made. On a miss:
        ``{"status": "cache_miss", "content", "tokens"}`` where ``tokens`` is
        the total usage reported by the API.
    """
    if prompt in response_cache:
        return {"status": "cache_hit", "content": response_cache[prompt], "tokens": 0, "cost": 0}

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        # `max_tokens` is deprecated in the Chat Completions API in favor of
        # `max_completion_tokens`; reasoning models reject the old name.
        max_completion_tokens=100,
    )
    content = response.choices[0].message.content
    # Store the answer so the next identical prompt is served for free.
    response_cache[prompt] = content
    return {
        "status": "cache_miss",
        "content": content,
        "tokens": response.usage.total_tokens,
    }


# 2. Short vs. long prompt
def _consultar_con_sistema(system_prompt: str, pregunta: str, max_output_tokens: int = 200):
    """Send ``pregunta`` to MODEL under ``system_prompt`` and return the raw response."""
    return client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": pregunta},
        ],
        # `max_tokens` is deprecated in the Chat Completions API in favor of
        # `max_completion_tokens`; reasoning models reject the old name.
        max_completion_tokens=max_output_tokens,
    )


def _resumir_respuesta(response) -> dict:
    """Reduce a response to its total token usage and a 100-character preview."""
    # `content` can be None; fall back to "" so the slice cannot fail.
    content = response.choices[0].message.content or ""
    return {"tokens": response.usage.total_tokens, "content": content[:100]}


def comparar_prompts(pregunta: str) -> dict:
    """Ask the same question under a long and a short system prompt.

    Args:
        pregunta: User question sent unchanged in both requests.

    Returns:
        A dict with ``prompt_largo`` and ``prompt_corto`` (each holding
        ``tokens`` and a 100-character ``content`` preview) plus
        ``ahorro_tokens``, the long-prompt total minus the short-prompt total.
    """
    # Long prompt first, then the short one.
    r_largo = _consultar_con_sistema(
        "Eres un asistente experto en comics de superheroes. Tu especialidad incluye Batman de DC Comics y Spider-Man de Marvel Comics. Responde de forma detallada, citando arcos narrativos y proporcionando contexto historico cuando sea relevante. Responde siempre en español.",
        pregunta,
    )
    r_corto = _consultar_con_sistema(
        "Experto en comics. Responde en español.",
        pregunta,
    )

    return {
        "prompt_largo": _resumir_respuesta(r_largo),
        "prompt_corto": _resumir_respuesta(r_corto),
        "ahorro_tokens": r_largo.usage.total_tokens - r_corto.usage.total_tokens,
    }


# Demo of both optimization strategies.
print("=" * 60, "ESTRATEGIAS DE OPTIMIZACION", "=" * 60, sep="\n")

# Cache: the same question asked three times in a row.
print("\n1. CACHE:")
for intento in range(1, 4):
    r = llm_call_cached("Que es Batman?")
    print(f"  Intento {intento}: {r['status']:12s} | tokens: {r['tokens']}")

# Prompt comparison: total tokens under the long vs. the short system prompt.
print("\n2. PROMPT CORTO vs LARGO:")
comp = comparar_prompts("Quien es el Joker?")
tokens_largo = comp["prompt_largo"]["tokens"]
tokens_corto = comp["prompt_corto"]["tokens"]
ahorro = comp["ahorro_tokens"]
# max(1, ...) keeps the percentage defined when the long total is 0.
ahorro_pct = ahorro / max(1, tokens_largo) * 100
print(f"  Largo: {tokens_largo} tokens")
print(f"  Corto: {tokens_corto} tokens")
print(f"  Ahorro: {ahorro} tokens ({ahorro_pct:.0f}%)")

## 5. Dashboard de Costos

In [None]:
# ============================================================
# DASHBOARD DE COSTOS
# ============================================================

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# All three panels share the same x axis: the 1-based request number.
request_numbers = range(1, len(budget.requests) + 1)

# Panel 1: cumulative tokens per request, against the session token limit.
tokens_acum = [r["cumulative_tokens"] for r in budget.requests]
axes[0].plot(request_numbers, tokens_acum, "o-", color="#2196F3")
axes[0].axhline(budget.max_tokens_per_session, color="red", linestyle="--", label=f"Limite: {budget.max_tokens_per_session:,}")
axes[0].fill_between(request_numbers, tokens_acum, alpha=0.1, color="#2196F3")
axes[0].set_xlabel("Request #")
axes[0].set_ylabel("Tokens acumulados")
axes[0].set_title("Consumo de Tokens")
axes[0].legend()

# Panel 2: cumulative cost, against the session cost limit.
costos_acum = [r["cumulative_cost"] for r in budget.requests]
axes[1].plot(request_numbers, costos_acum, "s-", color="#4CAF50")
axes[1].axhline(budget.max_cost_per_session, color="red", linestyle="--", label=f"Limite: ${budget.max_cost_per_session}")
axes[1].fill_between(request_numbers, costos_acum, alpha=0.1, color="#4CAF50")
axes[1].set_xlabel("Request #")
axes[1].set_ylabel("Costo acumulado (USD)")
axes[1].set_title("Costo Acumulado")
axes[1].legend()

# Panel 3: tokens used by each individual request, with the mean as reference.
tokens_per_req = [r["total_tokens"] for r in budget.requests]
axes[2].bar(request_numbers, tokens_per_req, color="#FF9800", alpha=0.8)
if tokens_per_req:
    # Computed once; skipped when nothing was recorded, since the mean of an
    # empty list is nan and there would be nothing to label in the legend.
    media_tokens = np.mean(tokens_per_req)
    axes[2].axhline(media_tokens, color="red", linestyle="--", label=f"Media: {media_tokens:.0f}")
    axes[2].legend()
axes[2].set_xlabel("Request #")
axes[2].set_ylabel("Tokens")
axes[2].set_title("Tokens por Request")

plt.tight_layout()
# Create the output directory first so savefig does not fail on a fresh checkout.
output_path = "../data/dashboard_costos.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.show()

print(f"Resumen final: {budget.summary()}")

## Takeaways

1. **TokenBudget** previene gastos descontrolados con límites por request y sesión
2. **Terminación graceful**: el sistema avisa cuando el presupuesto se agota, no crashea
3. **Cache** es la optimización más efectiva para preguntas repetitivas (0 tokens)
4. **Prompts cortos** ahorran tokens sin perder calidad significativa
5. **Dashboard** de costos es esencial para monitoreo en producción
6. Regla práctica: configurar el presupuesto en el **10% del máximo tolerable**, dejando el resto como margen de seguridad