# Agente de Precios – México
## FINAL v14 – HTML robusto (Bracket-extract `__PRELOADED_STATE__` + JSON-LD fallback) + Stats

Este v14 corrige el caso `offers_n=0` cuando el HTML no se puede `json.loads()` directo.

Orden de extracción:
1) `__PRELOADED_STATE__` (extraído por **balanceo de llaves** y luego parseado)
2) `application/ld+json` (JSON-LD) como respaldo

No depende de `/sites/MLM/search` ni de `/items`.


In [1]:

import re, json, time, statistics, requests
from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple


In [2]:

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept-Language": "es-MX,es;q=0.9,en;q=0.8",
}
SESSION = requests.Session()
SESSION.headers.update(DEFAULT_HEADERS)


## Normalización + producto

In [3]:

def normalize_text(s: str) -> str:
    return re.sub(r"\s+", " ", s.lower().strip())

def normalize_model(s: str) -> str:
    return re.sub(r"[^a-z0-9]", "", normalize_text(s))

ACCESSORY_NEGATIVES = [
    "funda","case","carcasa","protector","mica","glass","templado","cable",
    "adaptador","cargador","base","soporte","refacción","repuesto","control",
    "almohadillas","earpads","estuche","solo caja"
]

@dataclass
class IdentifiedProduct:
    brand: Optional[str]
    model: Optional[str]
    model_norm: Optional[str]
    signature: str

def extract_product(description: str) -> IdentifiedProduct:
    d = normalize_text(description)
    brand = "sony" if " sony " in f" {d} " else None
    mm = re.search(r"\b([a-z]{1,4}\s*[-]?\s*\d{2,6}\s*[a-z]{0,6}\d*)\b", d)
    model = mm.group(1) if mm else None
    model_norm = normalize_model(model) if model else None
    signature = " ".join([x for x in [brand, model] if x]).strip() or description.strip()
    return IdentifiedProduct(brand, model, model_norm, signature)

def match_title(title: str, product: IdentifiedProduct) -> bool:
    t = normalize_text(title)
    if any(x in t for x in ACCESSORY_NEGATIVES):
        return False
    if product.model_norm:
        return product.model_norm in normalize_model(title)
    if product.brand:
        return product.brand in t
    return True

def listing_url(query: str) -> str:
    slug = re.sub(r"[^a-z0-9]+", "-", normalize_text(query)).strip("-")
    return f"https://listado.mercadolibre.com.mx/{slug}"


## Bracket extractor de `__PRELOADED_STATE__` (sin regex frágil)

In [4]:

def extract_js_object_by_brackets(text: str, start_idx: int) -> Optional[str]:
    i = start_idx
    if i < 0 or i >= len(text) or text[i] != "{":
        return None
    depth = 0
    in_str = False
    esc = False
    quote = ""
    for j in range(i, len(text)):
        ch = text[j]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == quote:
                in_str = False
            continue
        else:
            if ch in ("'", '"'):
                in_str = True
                quote = ch
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return text[i:j+1]
    return None

def extract_preloaded_state(html: str) -> Optional[dict]:
    m = re.search(r"__PRELOADED_STATE__\s*=\s*", html)
    if not m:
        return None
    k = m.end()
    brace = html.find("{", k)
    if brace == -1:
        return None
    obj_str = extract_js_object_by_brackets(html, brace)
    if not obj_str:
        return None

    try:
        return json.loads(obj_str)
    except Exception:
        pass

    cleaned = obj_str
    cleaned = re.sub(r"\bundefined\b", "null", cleaned)
    cleaned = re.sub(r",\s*([}\]])", r"\1", cleaned)
    try:
        return json.loads(cleaned)
    except Exception:
        return None


## JSON-LD fallback (`application/ld+json`)

In [5]:

def extract_jsonld_nodes(html: str) -> List[Dict[str, Any]]:
    nodes = []
    for m in re.finditer(r'<script[^>]+type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL | re.IGNORECASE):
        raw = m.group(1).strip()
        try:
            data = json.loads(raw)
        except Exception:
            continue
        stack = [data]
        while stack:
            x = stack.pop()
            if isinstance(x, dict):
                if ("name" in x or "title" in x) and ("offers" in x or "price" in x):
                    nodes.append(x)
                for v in x.values():
                    if isinstance(v, (dict, list)):
                        stack.append(v)
            elif isinstance(x, list):
                for v in x:
                    if isinstance(v, (dict, list)):
                        stack.append(v)
    return nodes


## Construcción de ofertas

In [6]:

@dataclass
class Offer:
    title: str
    price: float
    condition: str
    url: str
    item_id: str
    source: str

def offers_from_state(state: dict, product: IdentifiedProduct, limit: int = 150) -> List[Offer]:
    out: List[Offer] = []
    stack = [state]
    while stack and len(out) < limit:
        x = stack.pop()
        if isinstance(x, dict):
            title = x.get("title") or x.get("name")
            price = x.get("price")
            if isinstance(price, dict):
                price = price.get("amount") or price.get("value")
            url = x.get("permalink") or x.get("url") or ""
            item_id = x.get("id") or x.get("item_id") or ""

            if title and price is not None:
                try:
                    p = float(price)
                    if match_title(str(title), product):
                        out.append(Offer(
                            title=str(title),
                            price=p,
                            condition=str(x.get("condition") or "unknown"),
                            url=str(url),
                            item_id=str(item_id),
                            source="preloaded_state",
                        ))
                except Exception:
                    pass

            for v in x.values():
                if isinstance(v, (dict, list)):
                    stack.append(v)
        elif isinstance(x, list):
            for v in x:
                if isinstance(v, (dict, list)):
                    stack.append(v)
    return out

def offers_from_jsonld(nodes: List[Dict[str, Any]], product: IdentifiedProduct, limit: int = 150) -> List[Offer]:
    out: List[Offer] = []
    for node in nodes:
        title = node.get("name") or node.get("title")
        url = node.get("url") or ""
        offers = node.get("offers")
        cand_prices = []
        if isinstance(offers, dict):
            cand_prices.append(offers.get("price"))
            url = offers.get("url") or url
        elif isinstance(offers, list):
            for o in offers:
                if isinstance(o, dict):
                    cand_prices.append(o.get("price"))
                    if not url:
                        url = o.get("url") or url

        for pr in cand_prices:
            if title and pr is not None:
                try:
                    p = float(pr)
                    if match_title(str(title), product):
                        out.append(Offer(
                            title=str(title),
                            price=p,
                            condition="unknown",
                            url=str(url),
                            item_id="",
                            source="jsonld",
                        ))
                        if len(out) >= limit:
                            return out
                except Exception:
                    continue
    return out


## Estadística + outliers (IQR)

In [7]:

def percentile(values: List[float], p: float) -> float:
    xs = sorted(values)
    if len(xs) == 1:
        return xs[0]
    k = (len(xs) - 1) * p
    f = int(k)
    c = min(f + 1, len(xs) - 1)
    if f == c:
        return xs[f]
    return xs[f] + (k - f) * (xs[c] - xs[f])

def iqr_bounds(values: List[float]) -> Tuple[float, float, float, float]:
    q1 = percentile(values, 0.25)
    q3 = percentile(values, 0.75)
    iqr = q3 - q1
    return q1, q3, q1 - 1.5*iqr, q3 + 1.5*iqr

def basic_stats(values: List[float]) -> Dict[str, Any]:
    xs = sorted(values)
    return {"n": len(xs), "min": xs[0], "max": xs[-1], "mean": sum(xs)/len(xs), "median": statistics.median(xs)}

def summarize_offers(offers: List[Offer]) -> Dict[str, Any]:
    if not offers:
        return {"new": None, "used": None, "unknown": None}
    groups = {"new": [], "used": [], "unknown": []}
    for o in offers:
        c = o.condition if o.condition in groups else "unknown"
        groups[c].append(o)

    out = {}
    for cond, offs in groups.items():
        prices = [o.price for o in offs]
        if not prices:
            out[cond] = None
            continue
        stats_all = basic_stats(prices)
        bounds = None
        inliers = offs
        if len(prices) >= 4:
            q1,q3,low,high = iqr_bounds(prices)
            bounds = {"q1": q1, "q3": q3, "low": low, "high": high}
            inliers = [o for o in offs if low <= o.price <= high]
        stats_inliers = basic_stats([o.price for o in inliers]) if (bounds and inliers) else None
        out[cond] = {"stats_all": stats_all, "iqr_bounds": bounds, "stats_inliers": stats_inliers}
    return out


## Agente FINAL (PRELOADED_STATE → JSON-LD)

In [8]:

def price_agent(description: str, max_offers: int = 25) -> Dict[str, Any]:
    product = extract_product(description)
    url = listing_url(product.signature)
    html = SESSION.get(url, timeout=25).text

    offers: List[Offer] = []
    strategy = "none"

    state = extract_preloaded_state(html)
    if isinstance(state, dict):
        offers = offers_from_state(state, product, limit=max_offers*6)
        strategy = "preloaded_state"

    if not offers:
        nodes = extract_jsonld_nodes(html)
        offers = offers_from_jsonld(nodes, product, limit=max_offers*6)
        strategy = "jsonld" if offers else "no_offers"

    offers = offers[:max_offers]
    summary = summarize_offers(offers)

    return {
        "identified_product": product.__dict__,
        "strategy": strategy,
        "listing_url": url,
        "offers_n": len(offers),
        "summary": summary,
        "offers": [o.__dict__ for o in offers],
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
    }

res = price_agent("Sony WH-1000XM5 audífonos", max_offers=25)
print("strategy:", res["strategy"])
print("identified_product:", res["identified_product"])
print("offers_n:", res["offers_n"])
print("stats_inliers:", json.dumps({k: (v and v.get('stats_inliers')) for k,v in res["summary"].items()}, ensure_ascii=False, indent=2))
print("\nMuestra ofertas:")
for o in res["offers"][:8]:
    print("-", o["price"], f"({o['source']})", "|", o["title"][:70])


strategy: jsonld
identified_product: {'brand': 'sony', 'model': 'wh-1000xm5', 'model_norm': 'wh1000xm5', 'signature': 'sony wh-1000xm5'}
offers_n: 22
stats_inliers: {
  "new": null,
  "used": null,
  "unknown": {
    "n": 20,
    "min": 1605.38,
    "max": 9008.94,
    "mean": 5601.116499999999,
    "median": 5010.16
  }
}

Muestra ofertas:
- 6642.51 (jsonld) | Sony Wh-1000xm5 Audifonos Inalambricos - Color Azul
- 228.39 (jsonld) | Pares De Reemplazo Para Auriculares Sony Wh-1000xm5: Auricul
- 231.27 (jsonld) | Caja Almacenamiento Para Auriculares Para Sony Wh-1000xm5
- 9008.94 (jsonld) | Audífonos Sony Wh-1000xm5 Noise Cancelling Sonido Envolvente Negro
- 1605.38 (jsonld) | Auriculares Sony Wh-1000xm5 Premium Con Cancelación De Ruido
- 6839.32 (jsonld) | Auriculares Sony Wh1000xm5/s Inalámbricos Con Cancelación De
- 7990.0 (jsonld) | Audífonos Inalámbricos Sony Wh-1000xm5 Color Negro Negro
- 7799.0 (jsonld) | Audífonos Inalámbricos Sony 1000x Series Wh-1000xm5 Negro Negro
