# SinIntermediarios — Daily Scrape (Bronze)

**Purpose**
- Read the URL catalog from `MAIN_TABLE`.
- Scrape prices (regular / promo / membership when available) + status, stock, image.
- Write an **append-only daily snapshot** to `SCRAPE_TABLE` partitioned by `scrape_date`.

**Outputs**
- `SCRAPE_TABLE` contains one row per (scrape_date, url) with the latest scrape result.

In [0]:
# 0) (Optional) Dependencies
# In Databricks notebooks, prefer %pip: it installs notebook-scoped libraries for the current notebook session.
# Uncomment if needed.
# %pip install -q beautifulsoup4 lxml requests pandas

In [0]:
%pip install beautifulsoup4 lxml
dbutils.library.restartPython()

In [0]:
# 1) CONFIG
# Notebook-wide settings read by the cells below.
from pyspark.sql import functions as F

# Catalog table read by the "LOAD INPUT URLS" cell (needs a URL column and a site column).
MAIN_TABLE  = "workspace.sinintermediarios.main_file"
# Target table for the daily snapshot described in the notebook header.
SCRAPE_TABLE = "workspace.sinintermediarios.bronze_scrape_daily"

# When set to an int, the input URL list is truncated to that many rows (see the load cell).
INPUT_LIMIT = None  # int for testing
# NOTE(review): presumably the pause between consecutive requests — its use is outside this view.
REQUEST_SLEEP_SECONDS = 0.25

# For safety: parallelize by SITE, not by URL (avoids hammering a single domain).
MAX_WORKERS = 6

# If you re-run in the same day, the notebook will de-dup per (scrape_date, url) before writing.

In [0]:
# 2) LOAD INPUT URLS
# Reads the URL catalog, resolves the URL / site columns case-insensitively,
# trims and de-duplicates the URLs, and shows a preview of what will be scraped.
from pyspark.sql import functions as F

main_df = spark.table(MAIN_TABLE)

# Map lower-cased column names to their real spelling so lookups ignore case.
cols_lc = {c.lower(): c for c in main_df.columns}
url_col = cols_lc.get('url')
# Accept either column name, as the error message below promises.
# 'comercio' keeps priority so tables that already work behave exactly as before.
site_col = cols_lc.get('comercio') or cols_lc.get('site')

if not url_col:
    raise ValueError(f"MAIN_TABLE must contain 'url'. Found: {main_df.columns}")
if not site_col:
    raise ValueError(f"MAIN_TABLE must contain 'site' or 'comercio'. Found: {main_df.columns}")

input_df = (
    main_df
      .select(
          F.trim(F.col(site_col)).alias('site_raw'),
          F.trim(F.col(url_col)).alias('url_raw')
      )
      # Drop rows whose URL is NULL or empty after trimming.
      .filter(F.col('url_raw').isNotNull() & (F.length('url_raw') > 0))
      # One row per distinct URL.
      .dropDuplicates(['url_raw'])
)

if INPUT_LIMIT:
    input_df = input_df.limit(int(INPUT_LIMIT))

display(input_df.limit(50))
print('Rows to scrape:', input_df.count())

In [0]:
# Databricks notebook source
# --------------------------------------------------------------------------------------
# Helpers: HTTP, parsing, normalization
# --------------------------------------------------------------------------------------
import datetime
import json
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, Optional
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

# Browser-like request headers applied to every session created in this file
# (HttpClient uses them when no explicit headers are given).
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "es-CO,es;q=0.9",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive",
}


def soup(html: str) -> BeautifulSoup:
    """Parse *html* with the lxml parser; a falsy value is parsed as an empty document."""
    markup = html if html else ""
    return BeautifulSoup(markup, "lxml")


@dataclass
class FetchResult:
    """Outcome of one ``HttpClient.get`` call.

    The ``status_code`` / ``text`` properties mirror the attribute names of a
    ``requests`` response so scraper code can read either spelling.
    """

    # "ok", "http_error" or "exception" (the values HttpClient.get produces).
    status: str
    # Empty string on success; otherwise "HTTP <code>" or "<ExceptionType>: <message>".
    error: str
    # Last HTTP status code seen; 0 when the request raised before any response.
    http_status: int
    # Wall-clock time of the whole call (retries and back-off included), in milliseconds.
    elapsed_ms: int
    # Response body on success; empty string otherwise.
    html: str

    @property
    def status_code(self) -> int:
        """Alias of ``http_status``."""
        return self.http_status

    @property
    def text(self) -> str:
        """Alias of ``html``."""
        return self.html


class HttpClient:
    """Tiny wrapper to keep scraper code consistent and add minimal resiliency.

    Wraps a single ``requests.Session`` and converts every outcome (success,
    non-200 response, raised exception) into a ``FetchResult`` so callers
    never have to catch network errors themselves.
    """

    def __init__(self, headers: Optional[dict] = None):
        """Create the session; *headers* replaces ``DEFAULT_HEADERS`` when given."""
        self.session = requests.Session()
        self.session.headers.update(headers or DEFAULT_HEADERS)

    def get(
        self,
        url: str,
        timeout_s: int = 25,
        retries: int = 2,
        backoff_s: int = 3,
        extra_headers: Optional[dict] = None,
    ) -> FetchResult:
        """GET *url*, retrying on any non-200 response or raised exception.

        Args:
            url: Address to fetch.
            timeout_s: Per-attempt timeout passed to ``requests``.
            retries: Extra attempts after the first one (total = retries + 1).
            backoff_s: Base sleep; attempt ``k`` (0-based) waits ``backoff_s * (k + 1)``.
            extra_headers: Optional headers for THIS request only. They are
                merged over the session headers by ``requests`` and leave the
                shared session untouched, so callers do not have to mutate
                ``self.session.headers``.

        Returns:
            ``FetchResult`` with ``status`` "ok" on HTTP 200, "http_error" when
            the last attempt got a non-200 response, or "exception" when the
            last attempt raised.
        """
        t0 = time.time()
        last_error = ""
        last_status = 0
        for attempt in range(retries + 1):
            try:
                r = self.session.get(
                    url,
                    timeout=timeout_s,
                    allow_redirects=True,
                    headers=extra_headers or None,
                )
                if r.status_code == 200:
                    return FetchResult(
                        status="ok",
                        error="",
                        http_status=int(r.status_code),
                        elapsed_ms=int((time.time() - t0) * 1000),
                        html=r.text or "",
                    )
                last_status = int(r.status_code)
                last_error = f"HTTP {r.status_code}"
            except Exception as e:
                # Deliberately broad: scrapers rely on get() never raising.
                last_error = f"{type(e).__name__}: {e}"
                last_status = 0
            if attempt < retries:
                # Linear back-off between attempts; no sleep after the last one.
                time.sleep(backoff_s * (attempt + 1))
        return FetchResult(
            status="exception" if last_status == 0 else "http_error",
            error=last_error,
            http_status=last_status,
            elapsed_ms=int((time.time() - t0) * 1000),
            html="",
        )


# Shared module-level client (default headers); the scrapers in this file call HTTP.get(...).
HTTP = HttpClient()


def default_result(site: Optional[str] = None, url: Optional[str] = None) -> Dict[str, Any]:
    """Build the canonical result record that every scraper fills in.

    Key order is fixed: identification, request outcome, raw prices
    (floats, normalized later), then stock / name / image.
    """
    record: Dict[str, Any] = {"site": site, "url": url, "status": "ok"}
    # Request metadata is unknown until the scraper runs.
    record.update(dict.fromkeys(("error", "http_status", "elapsed_ms", "scraped_at")))
    # price = regular / single purchase, promo_price = discounted,
    # membership = subscription / membership price.
    record.update(price=0.0, promo_price=0.0, membership=0.0)
    record.update(dict.fromkeys(("has_stock", "stock", "name_scraped", "image_url")))
    return record

# ---------- Site normalization ----------
def normalize_site(site_raw: str) -> str:
    """Map a free-text site label to a canonical key by substring matching.

    Returns None for a falsy input; unknown labels are lower-cased with
    spaces removed.

    NOTE(review): a second ``normalize_site`` is defined later in this file
    and rebinds the name, so this substring-matching version is not the one
    callers end up using.
    """
    if not site_raw:
        return None
    label = site_raw.strip().lower()

    # Checked in order; the first rule with any matching fragment wins.
    rules = (
        (("mercado",), "mercadolibre"),
        (("sin intermediarios",), "sinintermediarios"),
        (("farmatodo",), "farmatodo"),
        (("vitanas",), "vitanas"),
        (("nutramerican",), "nutramerican"),
        (("zonafit", "zona fit"), "zonafit"),
        (("proscience",), "proscience"),
        (("herbivore",), "herbivore"),
        (("colsubsidio",), "colsubsidio"),
        (("savvy",), "savvy"),
    )
    for fragments, canonical in rules:
        if any(fragment in label for fragment in fragments):
            return canonical

    return label.replace(" ", "")

def extract_og_image_url(sp: BeautifulSoup) -> str:
    """Return the first Open Graph / Twitter image URL found in *sp*, or "".

    Protocol-relative URLs ("//host/...") are given an "https:" scheme.
    """
    meta_selectors = (
        'meta[property="og:image"]',
        'meta[name="og:image"]',
        'meta[name="twitter:image"]',
        'meta[property="twitter:image"]',
    )
    for css in meta_selectors:
        node = sp.select_one(css)
        if not node or not node.get("content"):
            continue
        link = node["content"].strip()
        return "https:" + link if link.startswith("//") else link
    return ""


def normalize_site(site_raw: Optional[str]) -> str:
    """Normalize a site label to its canonical key.

    The label is lower-cased, "_" and "-" become spaces, whitespace runs are
    collapsed, and the result is looked up in the known-site table. Labels
    not in the table are returned with their spaces removed. ``None`` maps
    to "".
    """
    if site_raw is None:
        return ""

    label = str(site_raw).strip().lower()
    for separator in ("_", "-"):
        label = label.replace(separator, " ")
    label = re.sub(r"\s+", " ", label).strip()

    known_sites = {
        "savvy": "savvy",
        "mercado libre": "mercadolibre",
        "proscience": "proscience",
        "zona fit": "zonafit",
        "sin intermediarios": "sinintermediarios",
        "vitanas": "vitanas",
        "farmatodo": "farmatodo",
        "herbivore": "herbivore",
        "nutramerican": "nutramerican",
        "colsubsidio": "colsubsidio",
        "muscletech": "muscletech",
    }
    if label in known_sites:
        return known_sites[label]
    return label.replace(" ", "")


def clean_url(url: Optional[str]) -> str:
    """Return *url* trimmed and without its "#fragment".

    ``None`` and the placeholder strings "nan" / "none" / "" (any case) yield "".
    """
    if url is None:
        return ""
    text = str(url).strip()
    if text.lower() in {"nan", "none", ""}:
        return ""
    before_fragment, _, _ = text.partition("#")
    return before_fragment.strip()


def parse_price_any(val: Any) -> float:
    """Parse common price strings into a float.

    Handles:
      - numbers (int/float)
      - COP strings with thousand separators: "$174.100" / "$195,900" / "174.100,00"
      - Shopify JSON strings: "164990.00"

    Args:
        val: Number or text containing a price. Anything that is not a digit,
            "," or "." is ignored.

    Returns:
        The parsed amount, or 0.0 when nothing parseable is found. When the
        text ends in a two-digit decimal part the cents are dropped (COP has
        no cents in practice; Shopify may emit ".00").
    """
    if val is None:
        return 0.0
    if isinstance(val, (int, float)):
        try:
            return float(val)
        except (OverflowError, ValueError):
            return 0.0

    # Keep only digits + separators
    s2 = re.sub(r"[^0-9,.]", "", str(val))
    if s2 == "":
        return 0.0

    # A separator followed by exactly two trailing digits marks explicit cents.
    has_cents = bool(re.search(r"[,.]\d{2}$", s2))

    if "," in s2 and "." in s2:
        # Both present: the right-most separator is the decimal one.
        if s2.rfind(",") > s2.rfind("."):
            # "." thousand, "," decimal
            s2 = s2.replace(".", "").replace(",", ".")
        else:
            # "," thousand, "." decimal
            s2 = s2.replace(",", "")
    elif "," in s2:
        # Comma only: decimal when it ends with 2 digits, thousands otherwise.
        s2 = s2.replace(",", "." if has_cents else "")
    elif re.fullmatch(r"[1-9]\d{0,2}(?:\.\d{3})+", s2):
        # Dot only, grouped in threes ("174.100", "1.234.567"): thousands
        # separators. Without this branch float() would read "174.100" as
        # 174.1 and reject "1.234.567".
        s2 = s2.replace(".", "")
    # else: a single decimal dot or plain digits -> float() handles it

    try:
        x = float(s2)
        if has_cents:
            # Floor the cents away; int() raises OverflowError on inf.
            x = float(int(x))
    except (ValueError, OverflowError):
        return 0.0

    return x

def parse_price_cop(val):
    """COP-flavoured alias of ``parse_price_any``: returns WHOLE pesos (no decimals)."""
    whole_pesos = int(parse_price_any(val))
    return float(whole_pesos)


def cop_thousands_fix(x: float) -> float:
    """Scale values strictly between 1 and 1000 by 1000.

    Heuristic: COP prices are rarely that small, so such a value usually
    lost its thousands. Falsy input counts as 0.0; input that cannot be
    converted to float yields 0.0.
    """
    try:
        amount = float(x or 0.0)
    except Exception:
        return 0.0
    return amount * 1000 if 1 < amount < 1000 else amount

def http_get(url: str, headers: dict | None = None):
    """Fetch *url* and return the resulting ``FetchResult``.

    Without *headers* the shared ``HTTP`` client is used. With *headers* the
    request goes through a short-lived ``HttpClient`` whose headers are the
    shared session's headers overlaid with *headers*. This is needed because
    ``HttpClient.get`` takes no header argument, and it avoids mutating the
    shared session.
    """
    if not headers:
        return HTTP.get(url)

    # Later keys win, so caller-supplied headers override the shared ones.
    one_off = HttpClient({**HTTP.session.headers, **headers})
    try:
        return one_off.get(url)
    finally:
        one_off.session.close()


In [0]:
# --------------------------------------------------------------------------------------
# Scrapers
# --------------------------------------------------------------------------------------

# --- Generic Shopify / WooCommerce fallbacks (used for some sites) ---

def _jsonld_offer_price(sp) -> float:
    """Return the first non-zero ``offers.price`` found in JSON-LD scripts, else 0.0."""
    for tag in sp.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            data = json.loads(tag.get_text(strip=True) or "{}")
        except ValueError:
            # Malformed JSON-LD in one tag must not stop the search.
            continue
        for node in (data if isinstance(data, list) else [data]):
            if not isinstance(node, dict):
                continue
            offers = node.get("offers")
            if isinstance(offers, dict):
                offers = [offers]
            elif not isinstance(offers, list):
                continue
            for offer in offers:
                if isinstance(offer, dict) and offer.get("price") is not None:
                    price = parse_price_any(offer.get("price"))
                    if price:
                        return price
    return 0.0


def scrape_shopify_basic(url: str, site_key: str) -> dict:
    """Generic Shopify fallback scraper.

    Fetches the product page, takes the image from the OG/Twitter meta tags
    and the price from JSON-LD ``offers.price``, falling back to the
    ``product:price:amount`` meta tag.

    Returns:
        The canonical result dict. ``status`` is "ok", "no_price",
        "http_error" (response with status >= 400) or "exception" (any other
        fetch failure).
    """
    out = default_result(site_key, url)
    fr = HTTP.get(url)
    out["http_status"], out["elapsed_ms"] = fr.status_code, fr.elapsed_ms

    # Check the status code first: fr.error is also non-empty for HTTP errors,
    # so testing it first would label every 4xx/5xx as "exception".
    if fr.status_code >= 400:
        out["status"], out["error"] = "http_error", f"http_status={fr.status_code}"
        return out
    if fr.error:
        out["status"], out["error"] = "exception", fr.error
        return out

    sp = soup(fr.text)
    out["image_url"] = extract_og_image_url(sp)

    # JSON-LD offers.price
    price = _jsonld_offer_price(sp)

    if not price:
        meta = sp.find("meta", attrs={"property": "product:price:amount"})
        if meta and meta.get("content"):
            price = parse_price_any(meta.get("content"))

    out["price"] = float(price or 0.0)
    out["status"] = "ok" if out["price"] > 0 else "no_price"
    return out


def scrape_woocommerce_basic(url: str, site_key: str) -> dict:
    """Generic WooCommerce fallback scraper.

    Reads the first ``<bdi>`` price in the product price block as the regular
    price and the ``<ins>`` price (if any) as the promo price.

    Returns:
        The canonical result dict. ``status`` is "ok", "no_price",
        "http_error" (response with status >= 400) or "exception" (any other
        fetch failure).
    """
    out = default_result(site_key, url)
    fr = HTTP.get(url)
    out["http_status"], out["elapsed_ms"] = fr.status_code, fr.elapsed_ms

    # Check the status code first: fr.error is also non-empty for HTTP errors,
    # so testing it first would label every 4xx/5xx as "exception".
    if fr.status_code >= 400:
        out["status"], out["error"] = "http_error", f"http_status={fr.status_code}"
        return out
    if fr.error:
        out["status"], out["error"] = "exception", fr.error
        return out

    sp = soup(fr.text)
    # Fill the image like the sibling scrapers do; stays None when no tag is found.
    out["image_url"] = extract_og_image_url(sp) or None

    price = 0.0
    price_block = sp.select_one("p.price bdi") or sp.select_one("span.woocommerce-Price-amount bdi")
    if price_block:
        price = parse_price_any(price_block.get_text(" ", strip=True))

    promo = 0.0
    promo_block = sp.select_one("p.price ins bdi")
    if promo_block:
        promo = parse_price_any(promo_block.get_text(" ", strip=True))

    out["price"] = float(price or 0.0)
    out["promo_price"] = float(promo or 0.0)
    out["status"] = "ok" if (out["price"] > 0 or out["promo_price"] > 0) else "no_price"
    return out


# --- Site-specific scrapers (ported + cleaned from original notebook) ---

_MONEY_RE = re.compile(r"\$\s*([0-9][0-9\.,]*)")

def _to_cop_int(num_str: str) -> int:
    if not num_str:
        return 0
    s = num_str.strip().replace(".", "").replace(",", "")
    try:
        return int(s)
    except Exception:
        return 0


def _extract_prices_from_html(html: str) -> list[int]:
    """Return the distinct positive "$"-prefixed amounts in *html*, sorted ascending."""
    amounts = {_to_cop_int(token) for token in _MONEY_RE.findall(html or "")}
    return sorted(amount for amount in amounts if amount > 0)


import re
import requests
from urllib.parse import urlparse, parse_qs

def _extract_dp_tax_percent(html: str) -> int | None:
    """
    Sinintermediarios uses a DP (dual price) script that declares:
      dp_tax_percent = '10'
    We use this to compute member price = no_member_price / (1 + tax_percent/100)
    """
    if not html:
        return None

    # Accept: dp_tax_percent = '10'  OR  dp_tax_percent='10'
    m = re.search(r"dp_tax_percent\s*=\s*['\"](\d{1,2})['\"]", html, flags=re.IGNORECASE)
    if not m:
        return None

    try:
        return int(m.group(1))
    except Exception:
        return None


import re
import requests
from urllib.parse import urlparse, parse_qs

def _extract_dp_tax_percent(html: str) -> int | None:
    """
    Sinintermediarios dual pricing snippet declares something like:
      dp_tax_percent = '10'
    We use it to compute:
      member_price = full_price / (1 + tax/100)
    """
    if not html:
        return None
    m = re.search(r"dp_tax_percent\s*=\s*['\"](\d{1,2})['\"]", html, flags=re.IGNORECASE)
    if not m:
        return None
    try:
        return int(m.group(1))
    except Exception:
        return None


def scrape_sinintermediarios(url: str, site_key: str = "sinintermediarios") -> dict:
    """
    Shopify (sinintermediarios.co)

    Reads the product's ``.js`` endpoint for price / stock / image and the
    product page HTML for ``dp_tax_percent`` (used to derive the member price).

    Outputs (as floats for compatibility with your pipeline):
      - out["price"]      = No Miembro (full) in COP pesos
      - out["membership"] = Miembro in COP pesos (derived via dp_tax_percent, integer math)
      - out["promo_price"]= 0.0
      - out["image_url"]  = best-effort variant-specific image URL
      - out["has_stock"]  = variant availability
      - out["elapsed_ms"] = total time spent in this call
      - out["status"]     = ok / no_price / http_error / exception

    A missing, unknown or malformed ``variant`` query value falls back to the
    product's first variant. Never raises: unexpected errors are reported
    through ``status == "exception"``.
    """
    out = default_result(site_key, url)
    t0 = time.time()

    try:
        # The context manager closes the session's pooled connections on exit.
        with requests.Session() as session:
            session.headers.update(DEFAULT_HEADERS)

            # -----------------------------
            # 1) Parse variant from URL
            # -----------------------------
            parsed = urlparse(url)
            raw_variant = parse_qs(parsed.query).get("variant", [None])[0]
            try:
                variant_id = int(raw_variant) if raw_variant else None
            except ValueError:
                # Malformed id: treat as "no variant requested".
                variant_id = None

            # -----------------------------
            # 2) Fetch Shopify product JSON (.js)
            # -----------------------------
            base = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            js_url = base.rstrip("/") + ".js"

            r_js = session.get(js_url, timeout=20)
            out["http_status"] = int(r_js.status_code)
            if r_js.status_code != 200:
                out["status"], out["error"] = "http_error", f"js_http_{r_js.status_code}"
                return out

            product = r_js.json()
            variants = product.get("variants") or []

            # pick selected variant, else the first one
            selected = None
            if variant_id:
                selected = next(
                    (v for v in variants if int(v.get("id", 0)) == variant_id),
                    None,
                )
            if not selected:
                selected = variants[0] if variants else None

            if not selected:
                out["status"], out["error"] = "no_price", "no_variant"
                return out

            # Shopify stores price in cents -> COP pesos
            raw_cents = int(selected.get("price", 0) or 0)
            full_pesos = raw_cents // 100  # IMPORTANT

            out["has_stock"] = bool(selected.get("available", True))
            out["name_scraped"] = product.get("title")

            # -----------------------------
            # 3) Image URL (prefer variant-size image)
            # -----------------------------
            image_url = None

            # 3.1 variant featured_image if present
            fi = selected.get("featured_image")
            if isinstance(fi, dict):
                image_url = fi.get("src")

            # 3.2 try match images[] by option2 size (1LB/2LB/5LB encoded in filenames)
            if not image_url:
                imgs = product.get("images") or []
                opt2 = (selected.get("option2") or "").lower()  # e.g., "5 libras (2280g)"
                kw = []
                if "1 libra" in opt2:
                    kw = ["1lb", "1-lb", "1_lb"]
                elif "2 libras" in opt2:
                    kw = ["2lb", "2-lb", "2_lb"]
                elif "5 libras" in opt2:
                    kw = ["5lb", "5-lb", "5_lb"]

                if kw and imgs:
                    image_url = next(
                        (im for im in imgs if any(k in (im or "").lower() for k in kw)),
                        None,
                    )

            # 3.3 fallback to featured_image
            if not image_url:
                image_url = product.get("featured_image")

            if image_url and isinstance(image_url, str) and image_url.startswith("//"):
                image_url = "https:" + image_url

            out["image_url"] = image_url

            # -----------------------------
            # 4) Membership price (Miembro)
            #    Derived from dp_tax_percent in HTML using integer math (NO FLOATS)
            # -----------------------------
            membership_pesos = 0
            tax_percent = None

            # Best effort: a failed HTML fetch only costs the member price.
            try:
                r_html = session.get(url, timeout=20)
                if r_html.status_code == 200:
                    tax_percent = _extract_dp_tax_percent(r_html.text)
            except Exception:
                tax_percent = None

            if tax_percent and tax_percent > 0 and full_pesos > 0:
                # exact integer math:
                # member = full * 100 / (100 + tax)
                denom = 100 + int(tax_percent)
                membership_pesos = (full_pesos * 100) // denom

            # -----------------------------
            # 5) Output contract
            # -----------------------------
            out["price"] = float(full_pesos)                 # No Miembro
            out["membership"] = float(membership_pesos)      # Miembro
            out["promo_price"] = 0.0

            out["status"] = "ok" if (full_pesos > 0 or membership_pesos > 0) else "no_price"
            return out

    except Exception as e:
        out["status"], out["error"] = "exception", f"{type(e).__name__}: {e}"
        return out

    finally:
        # Runs on every exit path; mutates the dict that is being returned.
        out["elapsed_ms"] = int((time.time() - t0) * 1000)

def _extract_first_text(html: str, pattern: str):
    """Small helper: returns first regex group match or None."""
    import re
    try:
        m = re.search(pattern, html, flags=re.IGNORECASE | re.DOTALL)
        return m.group(1).strip() if m else None
    except Exception:
        return None

#####FARMATODO#########

# # --- Farmatodo (API-first) ---

# ##### FARMATODO (API seed URL strategy; compatible with your HttpClient) #####

import re
import json
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

# 1) Paste YOUR full URL here (the one from the curl command) exactly as-is (only the URL, without the "curl ...")
# The scraper reuses this URL for every product and only swaps the idItem parameter.
# NOTE(review): the URL embeds token / tokenIdWebSafe / key / idCustomerWebSafe values that look
# like session credentials — consider loading them from a secret store, and confirm how long
# they remain valid.
FARMATODO_GETITEM_SEED_URL = (
    "https://gw-backend.farmatodo.com/_ah/api/productEndpoint/v2/getItem"
    "?source=WEB&idItem=269800668"
    "&idCustomerWebSafe=ahZzfnN0dW5uaW5nLWJhc2UtMTY0NDAyci4LEgRVc2VyIiQ3ODY4NDA0OC1mOTVmLTRkMjktODliMy0yMzlmYmRmYTFhYTIM"
    "&idStoreGroup=26"
    "&nearbyStores=26,20,67,3,85,24,31,88,81,83,89,15,54"
    "&token=01e0d8b6fd85ad7a5c7120f56b901480"
    "&tokenIdWebSafe=ahZzfnN0dW5uaW5nLWJhc2UtMTY0NDAycl0LEgRVc2VyIiQ3ODY4NDA0OC1mOTVmLTRkMjktODliMy0yMzlmYmRmYTFhYTIMCxIFVG9rZW4iJDRjNTM1YjgyLWU2NDQtNDcxNS1hMWQyLWEzZjgyYjE2MDYzMAw"
    "&key=AIzaSyAidR6Tt0K60gACR78aWThMQb7L5u6Wpag"
    "&deliveryType=EXPRESS&storeId=26&city=BOG&isShoppingCart=false&customerId=undefined"
)

def _extract_farmatodo_item_id(url: str) -> str | None:
    m = re.search(r"/producto/(\d+)", url)
    return m.group(1) if m else None

def _replace_query_param(url: str, key: str, value: str) -> str:
    p = urlparse(url)
    q = parse_qs(p.query)
    q[key] = [value]
    new_query = urlencode(q, doseq=True)
    return urlunparse((p.scheme, p.netloc, p.path, p.params, new_query, p.fragment))

def _pick_product_by_id(getitem_json: dict, id_item: str) -> dict | None:
    item_sections = getitem_json.get("itemSection") or []
    for sec in item_sections:
        for lst in (sec.get("list") or []):
            for prod in (lst.get("product") or []):
                if str(prod.get("id")) == str(id_item):
                    return prod
    return None

def _norm_img(url: str | None) -> str | None:
    if not url:
        return None
    if url.startswith("//"):
        return "https:" + url
    return url

def scrape_farmatodo(url: str, site_key: str = "farmatodo") -> dict:
    """Scrape a Farmatodo product through the ``getItem`` API.

    The product id is taken from the "/producto/<id>" part of *url* and
    swapped into ``FARMATODO_GETITEM_SEED_URL``. The API call runs on a
    short-lived client carrying the API headers, so the shared ``HTTP``
    session is never modified (safe when other scrapers use it concurrently).

    Returns:
        The canonical result dict. ``status`` is "ok", "no_price",
        "invalid_input" (no product id in the URL), "http_error" (response
        with status >= 400) or "exception". Never raises.
    """
    out = default_result(site_key, url)

    try:
        id_item = _extract_farmatodo_item_id(url)
        if not id_item:
            out["status"], out["error"] = "invalid_input", "missing_idItem_in_url"
            return out

        # Build API URL from seed (swap idItem only)
        getitem_url = _replace_query_param(FARMATODO_GETITEM_SEED_URL, "idItem", id_item)

        # Minimal headers that commonly matter for this endpoint
        api_headers = {
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/json",
            "source": "WEB",
            "country": "COL",
            "device-id": "ANONIMO",
            # Referer/origin sometimes help; keep them stable
            "origin": "https://www.farmatodo.com.co",
            "referer": url,
        }

        # Shared session headers overlaid with the API ones (later keys win).
        api_client = HttpClient({**HTTP.session.headers, **api_headers})
        try:
            fr_api = api_client.get(getitem_url)
        finally:
            api_client.session.close()

        out["http_status"], out["elapsed_ms"] = fr_api.status_code, fr_api.elapsed_ms

        # Check the status code first: fr_api.error is also non-empty for HTTP
        # errors, so testing it first would label every 4xx/5xx as "exception".
        if fr_api.status_code >= 400:
            out["status"], out["error"] = "http_error", f"getitem_http_{fr_api.status_code}"
            return out
        if fr_api.error:
            out["status"], out["error"] = "exception", fr_api.error
            return out

        try:
            data = json.loads((fr_api.text or "").strip())
        except Exception as e:
            out["status"], out["error"] = "exception", f"failed_to_parse_getitem_json: {e}"
            return out

        prod = _pick_product_by_id(data, id_item)
        if not prod:
            # If we didn't find the id, don't accept a random fallback product (this is the "same price for all" symptom)
            out["status"] = "no_price"
            out["error"] = "getitem_id_not_found_in_response"
            return out

        # API returns numeric COP values (pesos)
        full_price  = int(float(prod.get("fullPrice") or 0))
        offer_price = int(float(prod.get("offerPrice") or 0))
        prime_price = int(float(prod.get("primePrice") or 0))

        # An offer only counts as a promo when it is a real discount on the full price.
        promo = offer_price if (offer_price > 0 and full_price > 0 and offer_price < full_price) else 0
        membership = prime_price if prime_price > 0 else 0

        imgs = prod.get("listUrlImages") or []
        image_url = _norm_img(imgs[0]) if imgs else _norm_img(prod.get("mediaImageUrl"))

        out["price"] = float(full_price or 0)
        out["promo_price"] = float(promo or 0)
        out["membership"] = float(membership or 0)

        out["name_scraped"] = prod.get("description") or prod.get("name")
        out["image_url"] = image_url
        # NOTE(review): "image" is not part of default_result(); kept because
        # code outside this view may read it.
        out["image"] = image_url or ""

        out["has_stock"] = bool(prod.get("hasStock", True))
        out["stock"] = 0

        out["status"] = "ok" if (out["price"] > 0 or out["promo_price"] > 0 or out["membership"] > 0) else "no_price"
        return out

    except Exception as e:
        out["status"], out["error"] = "exception", f"{type(e).__name__}: {e}"
        return out



# --- Muscletech (WooCommerce / WordPress) ---

def parse_price_cop_thousands(x) -> float:
    """
    COP helper:
    - "160.650" -> 160650
    - "$ 178.500" -> 178500
    - "20780000" (rare) -> 20780000

    Separator rules, applied after stripping everything except digits,
    "." and ",":
      - both present: whichever separator appears first is the thousands one
        ("1.234,56" and "1,234.56" both give 1234.56)
      - comma only: thousands when followed by exactly three digits, else decimal
      - dot only: thousands when followed by exactly three digits or when there
        are two or more dots, else decimal

    Returns 0.0 for None, empty or unparseable input.
    """
    if x is None:
        return 0.0

    # keep digits + separators
    s_clean = re.sub(r"[^\d\.,]", "", str(x).replace("\xa0", " ").strip())
    if not s_clean:
        return 0.0

    has_comma = "," in s_clean
    has_dot = "." in s_clean

    if has_comma and has_dot:
        # Example: "1.234,56" or "1,234.56"
        if s_clean.find(",") < s_clean.find("."):
            s_clean = s_clean.replace(",", "")
        else:
            s_clean = s_clean.replace(".", "").replace(",", ".")
    elif has_comma:
        # "123,456" looks like thousands => remove commas; otherwise decimal comma
        if re.search(r",\d{3}($|[^\d])", s_clean):
            s_clean = s_clean.replace(",", "")
        else:
            s_clean = s_clean.replace(",", ".")
    elif has_dot:
        # "160.650" or "1.234.567" look like thousands => remove dots;
        # otherwise the dot is left in place as a decimal point
        if re.search(r"\.\d{3}($|[^\d])", s_clean) or s_clean.count(".") >= 2:
            s_clean = s_clean.replace(".", "")

    try:
        return float(s_clean)
    except ValueError:
        return 0.0


def scrape_muscletech(url: str, site_key: str = "muscletech") -> dict:
    """Scrape a Muscletech (WooCommerce / WordPress) product page.

    Reads the regular price from ``<del>``, the sale price from ``<ins>``,
    falls back to the single visible price, and derives stock from the
    add-to-cart button and the ``outofstock`` body class.

    Returns:
        The canonical result dict. ``status`` is "ok", "no_price",
        "http_error" (response with status >= 400) or "exception" (any other
        fetch failure).
    """
    out = default_result(site_key, url)

    fr = HTTP.get(url)
    out["http_status"], out["elapsed_ms"] = fr.status_code, fr.elapsed_ms

    # Check the status code first: fr.error is also non-empty for HTTP errors,
    # so testing it first would label every 4xx/5xx as "exception".
    if fr.status_code >= 400:
        out["status"], out["error"] = "http_error", f"http_status={fr.status_code}"
        return out
    if fr.error:
        out["status"], out["error"] = "exception", fr.error
        return out

    html = fr.text
    sp = soup(html)

    out["image_url"] = extract_og_image_url(sp)

    # WooCommerce usual pattern:
    # <p class="price"><del>...178.500...</del> <ins>...160.650...</ins></p>
    sale_el = sp.select_one("p.price ins .woocommerce-Price-amount, p.price ins bdi")
    reg_el  = sp.select_one("p.price del .woocommerce-Price-amount, p.price del bdi")

    sale_txt = sale_el.get_text(" ", strip=True) if sale_el else ""
    reg_txt  = reg_el.get_text(" ", strip=True) if reg_el else ""

    sale_price = parse_price_cop_thousands(sale_txt)
    reg_price  = parse_price_cop_thousands(reg_txt)

    # fallback: sometimes there's only one visible price
    if reg_price <= 0:
        any_price_el = sp.select_one("p.price .woocommerce-Price-amount, p.price bdi")
        any_txt = any_price_el.get_text(" ", strip=True) if any_price_el else ""
        reg_price = parse_price_cop_thousands(any_txt)

    # If sale exists but reg missing, treat reg= sale
    if reg_price <= 0 and sale_price > 0:
        reg_price = sale_price

    # stock heuristic: purchasable button present and page not flagged out of stock
    body_classes = (sp.body.get("class", []) if sp.body else [])
    add_btn = sp.select_one("button.single_add_to_cart_button")
    out["has_stock"] = bool(add_btn) and ("outofstock" not in body_classes)

    out["price"] = float(reg_price or 0.0)          # full/regular
    out["promo_price"] = float(sale_price or 0.0)   # discount/sale
    out["membership"] = 0.0

    out["status"] = "ok" if (out["price"] > 0 or out["promo_price"] > 0) else "no_price"
    return out


# --- Savvy (FIX: COP thousands parsing + subscription membership) ---

def parse_price_cop_strict(text: str) -> float:
    """
    COP parser:
    - '$207.800' => 207800
    - '$166.240' => 166240
    - '$207,800' => 207800
    - '207800'   => 207800

    Always returns whole pesos. When both "." and "," are present the last
    one is the decimal separator and the decimals are dropped. A lone "." or
    "," is always treated as a thousands separator (COP formatting).
    Returns 0.0 for empty or unparseable input.
    """
    if not text:
        return 0.0

    # keep digits + separators only
    s = re.sub(r"[^\d\.,]", "", str(text)).strip()
    if not s:
        return 0.0

    try:
        if "." in s and "," in s:
            # choose last separator as decimal separator; remove the other as thousands
            if s.rfind(".") > s.rfind(","):
                # '.' is decimal, ',' thousands
                s = s.replace(",", "")
            else:
                # ',' is decimal, '.' thousands
                s = s.replace(".", "").replace(",", ".")
            return float(int(float(s)))

        # Only one kind of separator (or none): every separator is a thousands mark.
        digits = s.replace(".", "").replace(",", "")
        return float(int(digits)) if digits.isdigit() else 0.0
    except (ValueError, OverflowError):
        # ValueError: malformed number; OverflowError: absurdly long digit run.
        return 0.0


def _first_price_by_ids_cop(sp: BeautifulSoup, selectors: list[str]) -> float:
    """Return the first positive COP price found under *selectors* (tried in order), else 0.0."""
    for css in selectors:
        node = sp.select_one(css)
        if not node:
            continue
        amount = parse_price_cop_strict(node.get_text(" ", strip=True))
        if amount > 0:
            return amount
    return 0.0


def scrape_savvy(url: str, site_key: str = "savvy") -> dict:
    """
    Savvy is Shopify.
    - price_full: from product.js (cents -> COP pesos)
    - membership: subscription price (if present) from HTML IDs

    Also fills ``has_stock``, ``name_scraped``, ``image_url`` and
    ``elapsed_ms``. The HTML fetch is best effort: if it fails, the price
    from product.js is still returned with ``membership`` 0.0.

    Returns:
        The canonical result dict. ``status`` is "ok", "no_price",
        "http_error" (product.js not 200) or "exception". Never raises.
    """
    out = default_result(site_key, url)
    t0 = time.time()

    try:
        # The context manager closes the session's pooled connections on exit.
        with requests.Session() as session:
            session.headers.update(DEFAULT_HEADERS)

            # 1) Always pull product.js (truth for full price in cents)
            parsed = urlparse(url)
            base = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            js_url = base.rstrip("/") + ".js"

            r_js = session.get(js_url, timeout=20)
            out["http_status"] = int(r_js.status_code)
            if r_js.status_code != 200:
                out["status"], out["error"] = "http_error", f"js_http_{r_js.status_code}"
                return out

            product = r_js.json()

            # pick first variant (Savvy often uses Default Title)
            variants = product.get("variants") or []
            v = variants[0] if variants else None
            if not v:
                out["status"], out["error"] = "no_price", "no_variant"
                return out

            # Shopify stores price in cents -> COP pesos ("or 0" guards a null price)
            price_full = int(v.get("price", 0) or 0) // 100
            out["has_stock"] = bool(v.get("available", True))
            out["name_scraped"] = product.get("title")

            # image
            img = product.get("featured_image")
            if isinstance(img, str) and img.startswith("//"):
                img = "https:" + img
            out["image_url"] = img

            # 2) HTML to get subscription (membership) price (if shown)
            membership = 0.0
            try:
                r_html = session.get(url, timeout=20)
            except requests.RequestException:
                # Best effort: keep the price already obtained from product.js.
                r_html = None

            if r_html is not None and r_html.status_code == 200:
                sp = soup(r_html.text)

                # these ids exist in your SAVVYHTML (confirmed)
                membership = _first_price_by_ids_cop(sp, [
                    "#savvy-prime-precio-final-product-page",
                    "#savvy-prime-precio-final1-product-page",
                    "#savvy-prime-precio-final2-product-page",
                    "#savvy-prime-precio-final3-product-page",
                ])

                # fallback: infer from tokens inside the same container
                if membership <= 0:
                    block = re.search(
                        r'id="precios-savvy-prime-container-product-page".{0,8000}?</div>',
                        r_html.text, re.IGNORECASE | re.DOTALL
                    )
                    html_block = block.group(0) if block else r_html.text
                    tokens = re.findall(r"\$\s*[\d\.\,]+", html_block)
                    # Distinct amounts of at least 10000 pesos, each token parsed once.
                    vals = sorted({
                        int(amount)
                        for amount in map(parse_price_cop_strict, tokens)
                        if amount >= 10000
                    })
                    # With two or more distinct prices the smallest is taken as the member price.
                    if len(vals) >= 2:
                        membership = float(vals[0])

            out["membership"] = float(membership or 0.0)
            out["price"] = float(price_full or 0.0)
            out["status"] = "ok" if (out["price"] > 0 or out["membership"] > 0) else "no_price"
            return out

    except Exception as e:
        out["status"], out["error"] = "exception", f"{type(e).__name__}: {e}"
        return out

    finally:
        # Runs on every exit path; mutates the dict that is being returned.
        out["elapsed_ms"] = int((time.time() - t0) * 1000)


# --- ZonaFit (Shopify product .json) ---

def _get_variant_id_from_url(url: str) -> str:
    """Return the first ``variant`` query-string value of ``url``.

    An empty string is returned when the parameter is missing (or blank, since
    ``parse_qs`` drops blank values by default) or when parsing raises.
    """
    try:
        query_params = parse_qs(urlparse(url).query)
        variant_values = query_params.get("variant", [""])
        first_variant = variant_values[0]
    except Exception:
        first_variant = ""
    return first_variant


def scrape_zonafit(url: str, site_key: str = "zonafit") -> dict:
    """Scrape a ZonaFit product, reading the price from its Shopify ``.json`` endpoint.

    Two requests go through ``HTTP.get``: the product page itself (HTTP
    metadata and the ``og:image`` fallback) and ``<product-url>.json`` (variant
    price and images).

    Args:
        url: Product URL. A ``?variant=<id>`` query parameter selects the
            variant; otherwise the first variant is used.
        site_key: Site key handed to ``default_result``.

    Returns:
        The ``default_result`` dict with ``http_status``, ``elapsed_ms``,
        ``price``, ``image_url``, ``status`` and (on failure) ``error`` set.
        ``status`` is ``"ok"``, ``"no_price"``, ``"http_error"`` or
        ``"exception"``.
    """
    out = default_result(site_key, url)

    fr = HTTP.get(url)
    out["http_status"], out["elapsed_ms"] = fr.status_code, fr.elapsed_ms
    if fr.error:
        out["status"], out["error"] = "exception", fr.error
        return out
    if fr.status_code >= 400:
        out["status"], out["error"] = "http_error", f"http_status={fr.status_code}"
        return out

    # og:image from the HTML page is the image of last resort; it is attached
    # on every exit path below so a JSON failure does not lose it.
    og_img = extract_og_image_url(soup(fr.text))

    # Product JSON lives at the product URL (query string removed) + ".json".
    json_url = url.split("?")[0].rstrip("/") + ".json"

    frj = HTTP.get(json_url)
    if frj.error or frj.status_code >= 400:
        out["image_url"] = og_img
        out["status"] = "no_price"
        out["error"] = f"shopify_json_http_status={frj.status_code}"
        if frj.error:
            # Keep the transport error; the status code alone says nothing
            # when the request itself failed.
            out["error"] += f"; error={frj.error}"
        return out

    try:
        data = json.loads((frj.text or "").strip())
    except Exception as e:
        out["image_url"] = og_img
        out["status"], out["error"] = "exception", f"failed_to_parse_shopify_json: {e}"
        return out

    if not isinstance(data, dict):
        # Valid JSON but not the expected {"product": {...}} object.
        out["image_url"] = og_img
        out["status"] = "exception"
        out["error"] = (
            "failed_to_parse_shopify_json: "
            f"unexpected top-level type {type(data).__name__}"
        )
        return out

    product = data.get("product") or {}
    variants = product.get("variants") or []
    images = product.get("images") or []

    # Prefer the variant named in the URL, else fall back to the first one.
    variant_id = _get_variant_id_from_url(url)
    selected = None
    if variant_id:
        for v in variants:
            if str(v.get("id")) == str(variant_id):
                selected = v
                break
    selected = selected or (variants[0] if variants else None)

    out["price"] = parse_price_any((selected or {}).get("price"))

    # Image priority: variant featured image -> first product image -> og:image.
    image_url = None
    img_obj = (selected or {}).get("featured_image")
    if isinstance(img_obj, dict):
        image_url = img_obj.get("src")
    elif isinstance(img_obj, str):
        image_url = img_obj

    if not image_url and images:
        first_image = images[0]
        if isinstance(first_image, str):
            image_url = first_image
        elif isinstance(first_image, dict):
            image_url = first_image.get("src")

    out["image_url"] = image_url or og_img
    out["status"] = "ok" if out["price"] > 0 else "no_price"
    return out


# --- Vitanas / Proscience / Nutramerican / Colsubsidio / Farmatodo / Herbivore ---
# Note: these scrapers were copied as-is from the original notebook; only the
# price-parsing calls were normalized. They are intentionally kept close to
# their original implementations.


def scrape_vitanas(url: str) -> dict:
    """
    Scrape a Vitanas (WooCommerce) product page.

    Vitanas is WooCommerce, but:
      - price comes like "$158.000" and the generic logic parsed it as 158.0
      - image is in <picture><source srcset=...> (often webp)
    So price parsing is forced through parse_price_cop() and the image is taken
    from og:image / picture / img, in that order.

    The regular price goes to ``price`` and, when a sale ``<ins>`` element is
    present, the sale price goes to ``promo_price`` (same approach as
    scrape_proscience).

    Returns a dict with status / error / http_status / elapsed_ms / price /
    promo_price / membership / has_stock / stock / image / image_url.
    ``status`` is "ok" when a positive price was parsed, "no_price" otherwise,
    and "exception" / "http_error" when the page could not be fetched.
    """
    r = HTTP.get(url)

    out = {
        "status": "no_price",
        "error": r.error,
        "http_status": r.status_code,
        "elapsed_ms": r.elapsed_ms,
        "price": 0,
        "promo_price": 0,
        "membership": 0,
        "has_stock": True,
        "stock": 0,
        "image": "",
        "image_url": "",
    }

    if r.status_code != 200 or not r.text:
        # A transport error recorded by HTTP.get wins over a plain bad status.
        out["status"] = "exception" if out["error"] else "http_error"
        return out

    soup = BeautifulSoup(r.text, "html.parser")

    # --- PRICE (COP) ---
    # Target a single <bdi>, never the whole price container: the container's
    # text concatenates the regular and sale amounts into one number.
    price_text = ""

    meta_price = soup.select_one('meta[property="product:price:amount"]')
    if meta_price and meta_price.get("content"):
        price_text = meta_price.get("content", "")

    if not price_text:
        # Prefer <del> (regular price), then fall back to the first <bdi>.
        el = (
            soup.select_one("p.price del .woocommerce-Price-amount bdi") or
            soup.select_one("p.price del bdi") or
            soup.select_one("p.price .woocommerce-Price-amount bdi") or
            soup.select_one(".summary .price .woocommerce-Price-amount bdi") or
            soup.select_one("span.woocommerce-Price-amount bdi")
        )
        if el:
            price_text = el.get_text(" ", strip=True)

    out["price"] = int(parse_price_cop(price_text) or 0)

    # --- PROMO PRICE ---
    # <ins> holds the sale price when the product is discounted. Without this
    # the sale price of a discounted product was silently dropped.
    promo_el = (
        soup.select_one("p.price ins .woocommerce-Price-amount bdi") or
        soup.select_one("p.price ins bdi")
    )
    if promo_el:
        out["promo_price"] = int(parse_price_cop(promo_el.get_text(" ", strip=True)) or 0)

    # --- IMAGE ---
    # 1) og:image
    og = soup.select_one('meta[property="og:image"], meta[name="og:image"]')
    if og and og.get("content"):
        out["image_url"] = og.get("content", "").strip()

    # 2) picture source srcset
    if not out["image_url"]:
        src = soup.select_one("picture source[srcset]")
        if src and src.get("srcset"):
            # First candidate of the srcset, URL part only ("<url> <descriptor>").
            # A blank first candidate (e.g. srcset=" , x.webp 2x") yields no
            # tokens; skip it instead of raising IndexError.
            first_candidate = src.get("srcset", "").split(",")[0].split()
            if first_candidate:
                out["image_url"] = first_candidate[0]

    # 3) fallback: main product image
    if not out["image_url"]:
        img = soup.select_one("img.wp-post-image, .woocommerce-product-gallery__image img, img.attachment-shop_single")
        if img and img.get("src"):
            out["image_url"] = img.get("src", "").strip()

    out["status"] = "ok" if out["price"] > 0 else "no_price"
    return out

####################!!!!!!!PROSCIENCE!!!!!!!################

def scrape_proscience(url: str) -> dict:
    """
    Scrape a Proscience (WooCommerce) product page.

    Prices are parsed with parse_price_cop() from individual <bdi> nodes. The
    whole "p.price" container must not be used: it can hold both the regular
    and the sale amount, and its text concatenates them into one huge number.

    Returns a dict with status / error / http_status / elapsed_ms / price
    (regular) / promo_price (sale, from <ins>) / membership / has_stock /
    stock / image / image_url.
    """
    response = HTTP.get(url)

    result = {
        "status": "no_price",
        "error": response.error,
        "http_status": response.status_code,
        "elapsed_ms": response.elapsed_ms,
        "price": 0,
        "promo_price": 0,
        "membership": 0,
        "has_stock": True,
        "stock": 0,
        "image": "",
        "image_url": "",
    }

    if response.status_code != 200 or not response.text:
        result["status"] = "exception" if result["error"] else "http_error"
        return result

    page = BeautifulSoup(response.text, "html.parser")

    # ---- Regular price: meta tag first, then the WooCommerce nodes ----
    regular_text = ""
    meta_tag = page.select_one('meta[property="product:price:amount"]')
    if meta_tag and meta_tag.get("content"):
        regular_text = meta_tag.get("content", "")

    if not regular_text:
        # <del> carries the full/regular price; later selectors are fallbacks.
        regular_selectors = (
            "p.price del .woocommerce-Price-amount bdi",
            "p.price del bdi",
            "p.price .woocommerce-Price-amount bdi",
            ".summary .price .woocommerce-Price-amount bdi",
            "span.woocommerce-Price-amount bdi",
        )
        for selector in regular_selectors:
            node = page.select_one(selector)
            if node:
                regular_text = node.get_text(" ", strip=True)
                break

    # ---- Promo price: <ins> is the sale price ----
    for selector in ("p.price ins .woocommerce-Price-amount bdi", "p.price ins bdi"):
        sale_node = page.select_one(selector)
        if sale_node:
            sale_text = sale_node.get_text(" ", strip=True)
            result["promo_price"] = int(parse_price_cop(sale_text) or 0)
            break

    result["price"] = int(parse_price_cop(regular_text) or 0)

    # ---- Image: og:image, then the main product image ----
    og_tag = page.select_one('meta[property="og:image"], meta[name="og:image"]')
    if og_tag and og_tag.get("content"):
        result["image_url"] = og_tag.get("content", "").strip()

    if not result["image_url"]:
        product_img = page.select_one("img.wp-post-image, .woocommerce-product-gallery__image img, img.attachment-shop_single")
        if product_img and product_img.get("src"):
            result["image_url"] = product_img.get("src", "").strip()

    result["status"] = "ok" if result["price"] > 0 else "no_price"
    return result

####################!!!!!!!NUTRAMERICAN!!!!!!!################

def scrape_nutramerican(url: str, site_key: str = "nutramerican") -> dict:
    """Scrape a Nutramerican product page with a direct ``requests.get`` call.

    The page exposes several price nodes:
      - s#priceDefaultDes    -> USD amount (hidden), not used
      - strong#priceDefault  -> COP amount (visible), used as ``price``
      - strong#priceDiscount -> used as ``promo_price`` and as the ``price``
        fallback when priceDefault has no text

    Returns a result dict; any exception is captured into ``status`` /
    ``error`` instead of propagating.
    """
    started = time.time()

    def _elapsed_ms() -> int:
        return int((time.time() - started) * 1000)

    result = {
        "status": "no_price",
        "error": None,
        "site_raw": site_key,
        "site": site_key,
        "url": url,

        "price": 0,
        "promo_price": 0,
        "membership": 0,

        "has_stock": True,
        "stock": 0,
        "image": "",
        "image_url": "",

        "http_status": 0,
        "elapsed_ms": 0,
    }

    try:
        # Browser-like headers are sent with the request.
        browser_headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "es-CO,es;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
        }

        resp = requests.get(url, headers=browser_headers, timeout=20, allow_redirects=True)
        result["http_status"] = resp.status_code
        result["elapsed_ms"] = _elapsed_ms()

        if resp.status_code != 200 or not resp.text:
            result["status"] = "http_error"
            result["error"] = f"http_status={resp.status_code}"
            return result

        page = BeautifulSoup(resp.text, "html.parser")

        # ---- Read both price nodes once ----
        default_node = page.select_one("strong#priceDefault")
        discount_node = page.select_one("strong#priceDiscount")
        default_txt = default_node.get_text(" ", strip=True) if default_node else ""
        discount_txt = discount_node.get_text(" ", strip=True) if discount_node else ""

        # ---- Price: priceDefault, falling back to priceDiscount ----
        price_txt = default_txt or discount_txt
        parsed_price = parse_price_cop(price_txt) if price_txt else 0
        result["price"] = int(parsed_price) if parsed_price else 0

        # ---- Promo price (optional) ----
        parsed_promo = parse_price_cop(discount_txt) if discount_txt else 0
        result["promo_price"] = int(parsed_promo) if parsed_promo else 0

        # ---- Image: og:image, else the first <img> inside <article> ----
        og_tag = page.select_one("meta[property='og:image']")
        if og_tag and og_tag.get("content"):
            result["image_url"] = og_tag["content"].strip()
        else:
            article_img = page.select_one("article img")
            if article_img and article_img.get("src"):
                result["image_url"] = article_img["src"].strip()

        result["status"] = "ok" if result["price"] > 0 else "no_price"
        return result

    except Exception as e:
        result["status"] = "exception"
        result["error"] = f"{type(e).__name__}: {e}"
        result["elapsed_ms"] = _elapsed_ms()
        return result

####################!!!!!!!COLSUBSIDIO!!!!!!!################

def scrape_colsubsidio(url: str, site_key: str):
    """Scrape a Colsubsidio (VTEX) product page.

    The price is resolved through three attempts, stopping at the first that
    yields a positive value:
      1. the VTEX currencyInteger / currencyDecimal spans,
      2. numeric price fields embedded as JSON in the HTML,
      3. the first "$..." token in the page text.

    ``site_key`` is accepted for dispatcher compatibility and is not used.
    Any exception is captured into ``status`` / ``error``.
    """
    started = time.time()

    result = {
        "status": "no_price",
        "error": None,
        "http_status": 0,
        "elapsed_ms": 0,
        "price": 0,
        "promo_price": 0,
        "membership": 0,
        "has_stock": True,
        "stock": 0,
        "image": "",
        "image_url": None,
    }

    try:
        response = HTTP.get(url)  # no timeout=
        result["http_status"] = getattr(response, "status_code", 0)
        html = getattr(response, "text", "") or ""
        result["elapsed_ms"] = int((time.time() - started) * 1000)

        if not (result["http_status"] == 200 and html):
            result["status"] = "exception"
            result["error"] = f"http_status={result['http_status']}"
            return result

        page = BeautifulSoup(html, "html.parser")

        # ---- Image: og:image, else a vtexassets product image ----
        og_tag = page.find("meta", attrs={"property": "og:image"})
        if og_tag and og_tag.get("content"):
            result["image_url"] = og_tag["content"]
        else:
            asset_img = page.select_one('img[src*="vtexassets.com/arquivos/ids"]')
            if asset_img and asset_img.get("src"):
                result["image_url"] = asset_img["src"]

        # ---- Attempt 1: VTEX DOM spans (only present if SSR includes price) ----
        whole_node = page.select_one(".vtex-product-price-1-x-currencyInteger")
        frac_node = page.select_one(".vtex-product-price-1-x-currencyDecimal")

        whole_digits = re.sub(r"\D", "", whole_node.get_text(strip=True) if whole_node else "")
        frac_digits = re.sub(r"\D", "", frac_node.get_text(strip=True) if frac_node else "")

        if whole_digits and frac_digits:
            # Pad/truncate the second group to 3 digits: 226 + 750 => 226750
            combined = int(whole_digits + (frac_digits + "000")[:3])
            # If the "integer" span already held the full formatted price, the
            # concatenation is nonsense; anything above 20M COP is discarded so
            # the JSON fallback below can take over.
            if combined <= 20_000_000:
                result["price"] = combined

        # ---- Attempt 2: numeric price fields embedded as JSON in the HTML ----
        if result["price"] <= 0:
            json_price_patterns = (
                r'"sellingPrice"\s*:\s*([0-9]+(?:\.[0-9]+)?)',
                r'"spotPrice"\s*:\s*([0-9]+(?:\.[0-9]+)?)',
                r'"Price"\s*:\s*([0-9]+(?:\.[0-9]+)?)',
                r'"ListPrice"\s*:\s*([0-9]+(?:\.[0-9]+)?)',
                r'"price"\s*:\s*([0-9]+(?:\.[0-9]+)?)',
            )

            found = []
            for pattern in json_price_patterns:
                for hit in re.finditer(pattern, html):
                    try:
                        amount = float(hit.group(1))
                        # drop tiny values (e.g. 61); keep COP-like numbers
                        if amount >= 1000:
                            found.append(int(round(amount)))
                    except Exception:
                        pass

            # The maximum plausible value is used (min tends to pick up
            # unrelated small fields).
            plausible = [value for value in found if 1000 <= value <= 20000000]
            if plausible:
                result["price"] = max(plausible)

        # ---- Attempt 3: first "$226.750"-style token in the page text ----
        if result["price"] <= 0:
            dollar_token = re.search(r"\$\s*[\d\.\,]+", page.get_text(" ", strip=True))
            if dollar_token:
                result["price"] = int(parse_price_cop(dollar_token.group(0)))

        result["status"] = "ok" if result["price"] > 0 else "no_price"
        return result

    except Exception as e:
        result["status"] = "exception"
        result["error"] = repr(e)
        result["elapsed_ms"] = int((time.time() - started) * 1000)
        return result



####################!!!!!!!HERBIVORE!!!!!!!################

def scrape_herbivore(url: str, site_key: str):
    """Scrape a Herbivore (WooCommerce markup) product page.

    The price is read from the first ``span.woocommerce-Price-amount.amount
    bdi`` node, falling back to the first "$..." token in the page text.
    The image comes from og:image, else a product/upload <img>.

    ``site_key`` is accepted for dispatcher compatibility and is not used.
    Any exception is captured into ``status`` / ``error``.
    """
    started = time.time()

    result = {
        "status": "no_price",
        "error": None,
        "http_status": 0,
        "elapsed_ms": 0,
        "price": 0,
        "promo_price": 0,
        "membership": 0,
        "has_stock": True,
        "stock": 0,
        "image": "",
        "image_url": None,
    }

    try:
        response = HTTP.get(url)  # no timeout=
        result["http_status"] = getattr(response, "status_code", 0)
        html = getattr(response, "text", "") or ""
        result["elapsed_ms"] = int((time.time() - started) * 1000)

        if not (result["http_status"] == 200 and html):
            result["status"] = "exception"
            result["error"] = f"http_status={result['http_status']}"
            return result

        page = BeautifulSoup(html, "html.parser")

        # ---- Price: formatted on-page string such as "$ 199.000" ----
        amount_node = page.select_one("span.woocommerce-Price-amount.amount bdi")
        if amount_node:
            amount_text = amount_node.get_text(" ", strip=True)
            result["price"] = int(parse_price_cop(amount_text))

        # Fallback: any $xxx.xxx token on the page
        if result["price"] <= 0:
            dollar_token = re.search(r"\$\s*[\d\.\,]+", page.get_text(" ", strip=True))
            if dollar_token:
                result["price"] = int(parse_price_cop(dollar_token.group(0)))

        # ---- Image ----
        og_tag = page.find("meta", attrs={"property": "og:image"})
        if og_tag and og_tag.get("content"):
            result["image_url"] = og_tag["content"]
        else:
            fallback_img = (
                page.select_one("img.wp-post-image")
                or page.select_one('img[src*="/wp-content/uploads/"]')
            )
            if fallback_img and fallback_img.get("src"):
                result["image_url"] = fallback_img["src"]

        result["status"] = "ok" if result["price"] > 0 else "no_price"
        return result

    except Exception as e:
        result["status"] = "exception"
        result["error"] = repr(e)
        result["elapsed_ms"] = int((time.time() - started) * 1000)
        return result

In [0]:
# --------------------------------------------------------------------------------------
# Dispatcher + row-level wrapper
# --------------------------------------------------------------------------------------

# Maps a canonical site key (as produced by normalize_site) to a one-argument
# callable that takes the product URL and returns the scraper's result dict.
# The lambdas defer the lookup of each scrape_* name until the entry is
# actually called, so building this dict does not require every scraper to be
# defined yet.
# NOTE: "mercadolibre" has no entry here, so those rows come back as
# "unsupported_site" from the row-level wrappers.
SITE_TO_SCRAPER = {
    "savvy": lambda u: scrape_savvy(u, "savvy"),
    "zonafit": lambda u: scrape_zonafit(u, "zonafit"),
    "sinintermediarios": lambda u: scrape_sinintermediarios(u, "sinintermediarios"),

    # These two take only the URL (no site_key parameter).
    "proscience": lambda u: scrape_proscience(u),
    "nutramerican": lambda u: scrape_nutramerican(u, "nutramerican"),
    "vitanas": lambda u: scrape_vitanas(u),

    "muscletech": lambda u: scrape_muscletech(u, "muscletech"),
    "colsubsidio": lambda u: scrape_colsubsidio(u, "colsubsidio"),
    "herbivore": lambda u: scrape_herbivore(u, "herbivore"),
    "farmatodo": lambda u: scrape_farmatodo(u, "farmatodo"),

}


def standardize_result(site_raw: str, site_key: str, url: str, r: dict) -> dict:
    """Merge a scraper result onto ``default_result`` and normalize it.

    - ``site`` / ``url`` fall back to the arguments when the scraper left them empty.
    - ``scraped_at`` is always overwritten with the current UTC ISO timestamp.
    - ``price`` / ``promo_price`` / ``membership`` are passed through
      ``parse_price_any`` and then ``cop_thousands_fix``.
    - A missing, empty or "ok" status is re-derived from the normalized prices.
    - ``site_raw`` is always set to the caller-supplied value.
    """
    row = default_result(site_key, url)
    row.update(r or {})

    row["site"] = row.get("site") or site_key
    row["url"] = row.get("url") or url

    # One timestamp format for every row, regardless of what the scraper set.
    row["scraped_at"] = datetime.datetime.now(datetime.timezone.utc).isoformat()

    for price_field in ("price", "promo_price", "membership"):
        row[price_field] = cop_thousands_fix(parse_price_any(row.get(price_field)))

    # "ok" is only trusted when at least one price is positive.
    if row.get("status") in (None, "", "ok"):
        best_price = max(row["price"], row["promo_price"], row["membership"])
        row["status"] = "ok" if best_price > 0 else "no_price"

    row["site_raw"] = site_raw
    return row


def scrape_row(site_raw: str, url_raw: str, request_sleep_s: float = 0.25) -> dict:
    """Scrape one (site, url) pair and always return a result dict.

    Failure rows use these statuses: "invalid_input" (site or URL empty after
    normalization), "unsupported_site" (no entry in SITE_TO_SCRAPER) and
    "exception" (the scraper or the standardization raised).

    ``request_sleep_s`` seconds are slept after every scraper attempt,
    successful or not; the two early-return validation paths do not sleep.
    """
    site_key = normalize_site(site_raw)
    url = clean_url(url_raw)

    def _failure(row: dict, status: str, error: str) -> dict:
        # Stamp a default row with the failure details and the current UTC time.
        row.update({
            "site_raw": site_raw,
            "status": status,
            "error": error,
            "scraped_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        })
        return row

    if not (site_key and url):
        return _failure(
            default_result(site_key or None, url or None),
            "invalid_input",
            f"missing site/url (site_raw='{site_raw}', url_raw='{url_raw}')",
        )

    scraper = SITE_TO_SCRAPER.get(site_key)
    if not scraper:
        return _failure(
            default_result(site_key, url),
            "unsupported_site",
            f"unsupported site key '{site_key}' (raw='{site_raw}')",
        )

    try:
        scraped = scraper(url) or {}
        return standardize_result(site_raw, site_key, url, scraped)
    except Exception as e:
        return _failure(
            default_result(site_key, url),
            "exception",
            f"{type(e).__name__}: {str(e)[:500]}",
        )
    finally:
        time.sleep(request_sleep_s)

In [0]:
from pyspark.sql import functions as F

# ---- Source table + columns ----
MAIN_TABLE = "workspace.sinintermediarios.main_file"
URL_COL = "url"
SITE_COL = "comercio"   # <-- IMPORTANT: your store column

main_df = spark.table(MAIN_TABLE)

# ---- Build input dataframe for scraping ----
# Values are trimmed in the select itself, not only inside the filter:
# the driver cell hands url_raw to the scrapers unchanged, so padded URLs
# would otherwise be requested as-is, and "url" / "url " would survive
# dropDuplicates as two separate rows.
input_df = (
    main_df
    .select(
        F.trim(F.col(SITE_COL).cast("string")).alias("site_raw"),
        F.trim(F.col(URL_COL).cast("string")).alias("url_raw"),
    )
    .where(F.col("url_raw").isNotNull() & (F.length(F.col("url_raw")) > 0))
    .where(F.col("site_raw").isNotNull() & (F.length(F.col("site_raw")) > 0))
    .dropDuplicates(["site_raw", "url_raw"])
)

display(input_df.limit(50))
print("Rows to scrape:", input_df.count())

In [0]:
# ============================================================
# FINAL CELL (COPY/PASTE REPLACEMENT) — FIXES CANNOT_DETERMINE_TYPE
# Key fix: we explicitly define a Spark schema (no inference).
# Input: Spark DF `input_df` with columns: site_raw, url_raw
# Output: Spark DF `out_spark` + per-site status summary
# ============================================================

import re, time, datetime, unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed

from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, LongType, BooleanType
)
from pyspark.sql import functions as F

# ---------------- Config ----------------
# Two nested thread pools are used below: one worker per site, and inside each
# site a pool of URL workers. Up to
# MAX_SITE_WORKERS * MAX_URL_WORKERS_PER_SITE scrapes can therefore run at once.
MAX_SITE_WORKERS = 10              # parallel across sites
MAX_URL_WORKERS_PER_SITE = 4       # parallel within a site
DEBUG_LIMIT = None                 # set e.g. 200 while debugging, None for full run

# ---------------- Helpers ----------------
def _now_utc_iso():
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
    return utc_now.isoformat()

def _strip_accents(s: str) -> str:
    """Return ``s`` (coerced to ``str``) with combining marks removed.

    The text is NFKD-decomposed and every combining character is dropped.
    ``None`` yields an empty string.
    """
    if s is None:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s))
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(base_chars)

import re

def parse_price_cop(x) -> int:
    """
    Parse a COP price into integer pesos (cents are discarded).

    Accepted shapes include:
      - $79.990
      - $79.990,00
      - 79990.0
      - 99.000,00
      - 108,900.00

    A trailing separator followed by 1-2 digits is treated as the decimal part
    and dropped; every other '.' or ',' is treated as a thousands separator.
    Returns 0 for None, empty input, or input without digits.
    """
    if x is None:
        return 0

    # Keep only digits and the two separator characters.
    cleaned = re.sub(r"[^\d,\.]", "", str(x).strip())
    if not cleaned:
        return 0

    # Cut a trailing decimal part, e.g. ",00" / ".00" / ".0".
    decimal_tail = re.search(r"([,\.])(\d{1,2})$", cleaned)
    if decimal_tail is not None:
        cleaned = cleaned[:decimal_tail.start()]

    # Whatever separators remain are thousands separators.
    digits_only = re.sub(r"[^\d]", "", cleaned)
    if not digits_only:
        return 0

    try:
        return int(digits_only)
    except Exception:
        return 0

# Canonical site key -> set of accepted raw spellings.
# normalize_site compares its input against these after lowercasing,
# accent-stripping and whitespace-collapsing, and a second time with every
# non-alphanumeric character removed from both sides.
SITE_ALIASES = {
    "farmatodo": {"farmatodo", "farmatodo.com.co"},
    "mercadolibre": {"mercado libre", "mercadolibre", "mercado_libre", "ml"},
    "sinintermediarios": {"sin intermediarios", "sinintermediarios", "sin-intermediarios"},
    "colsubsidio": {"colsubsidio", "droguerias colsubsidio", "drogueriascolsubsidio"},
    "nutramerican": {"nutramerican", "nutramerican pharma", "nutramericanpharma"},
    "proscience": {"proscience"},
    "vitanas": {"vitanas"},
    "herbivore": {"herbivore"},
    "savvy": {"savvy", "youaresavvy", "you are savvy"},
    "muscletech": {"muscletech"},
    "zonafit": {"zonafit", "zona fit", "zona_fit"},
}

def normalize_site(site_raw: str) -> str:
    """Map a raw store name to its canonical site key.

    The input is accent-stripped, lowercased, trimmed and whitespace-collapsed,
    then matched against SITE_ALIASES: first as-is, then with every
    non-alphanumeric character removed on both sides. When nothing matches,
    the squeezed alphanumeric form is returned.

    Returns None for falsy input or when no alphanumeric characters remain.
    """
    if not site_raw:
        return None

    text = re.sub(r"\s+", " ", _strip_accents(site_raw).lower().strip())

    # Join the multi-word store names before alias matching.
    for spaced, joined in (
        ("mercado libre", "mercadolibre"),
        ("zona fit", "zonafit"),
        ("sin intermediarios", "sinintermediarios"),
    ):
        text = text.replace(spaced, joined)

    # Pass 1: exact match on the cleaned text.
    exact = next(
        (canon for canon, aliases in SITE_ALIASES.items()
         if text == canon or text in aliases),
        None,
    )
    if exact is not None:
        return exact

    # Pass 2: compare with all punctuation/whitespace squeezed out.
    squeezed = re.sub(r"[^a-z0-9]+", "", text)
    for canon, aliases in SITE_ALIASES.items():
        squeezed_aliases = {re.sub(r"[^a-z0-9]+", "", alias) for alias in aliases}
        if squeezed == canon or squeezed in squeezed_aliases:
            return canon

    return squeezed or None

# ---------------- Output schema (NO inference) ----------------
# Explicit schema handed to spark.createDataFrame so no type inference runs
# on the result rows. Every field is nullable.
OUT_SCHEMA = StructType([
    # Identity of the scraped row
    StructField("site_raw", StringType(), True),
    StructField("site", StringType(), True),
    StructField("url", StringType(), True),

    # Outcome + request metadata; scraped_at is kept as an ISO-8601 string
    StructField("status", StringType(), True),
    StructField("error", StringType(), True),
    StructField("http_status", IntegerType(), True),
    StructField("elapsed_ms", IntegerType(), True),
    StructField("scraped_at", StringType(), True),

    StructField("price",       LongType(),    True),  # bigint: avoids OverflowError on large COP prices
    StructField("promo_price", LongType(),    True),
    StructField("membership",  LongType(),    True),

    StructField("has_stock", BooleanType(), True),
    # NOTE(review): several scrapers in this notebook return an int for "stock"
    # (e.g. 0) while the column is declared as a string — confirm this is intended.
    StructField("stock", StringType(), True),

    StructField("name_scraped", StringType(), True),
    StructField("image_url", StringType(), True),
])

def _row_with_defaults(**kwargs):
    """
    Build an output row that contains every OUT_SCHEMA field.

    Keyword arguments override the defaults (None everywhere, 0 for the three
    price fields). Afterwards:
      - http_status / elapsed_ms / price / promo_price / membership are cast
        with int(); a failed cast becomes None for the first two and 0 for the
        prices. An explicit None is left as None.
      - has_stock is coerced to True / False / None; non-bool values are
        interpreted from their lowercase string form.
    """
    defaults = (
        ("site_raw", None), ("site", None), ("url", None),
        ("status", None), ("error", None), ("http_status", None),
        ("elapsed_ms", None), ("scraped_at", None),
        ("price", 0), ("promo_price", 0), ("membership", 0),
        ("has_stock", None), ("stock", None),
        ("name_scraped", None), ("image_url", None),
    )
    row = dict(defaults)
    row.update(kwargs)

    # Integer-typed columns: cast, with a per-column fallback on failure.
    null_on_failure = ("http_status", "elapsed_ms")
    zero_on_failure = ("price", "promo_price", "membership")
    for field in null_on_failure + zero_on_failure:
        value = row.get(field)
        if value is None:
            continue
        try:
            row[field] = int(value)
        except Exception:
            row[field] = None if field in null_on_failure else 0

    # Boolean column: accept real bools, translate common textual flags.
    flag = row.get("has_stock")
    if flag is not None and not isinstance(flag, bool):
        token = str(flag).lower()
        if token in ("true", "1", "yes", "y"):
            row["has_stock"] = True
        elif token in ("false", "0", "no", "n"):
            row["has_stock"] = False
        else:
            row["has_stock"] = None

    return row

# ---------------- Input ----------------
# input_df is produced by an earlier notebook cell; fail fast with a clear
# message if that cell was not run.
if "input_df" not in globals():
    raise ValueError("input_df is not defined. Run the cell that creates input_df (site_raw/url_raw) first.")

df_for_scrape = input_df.select("site_raw", "url_raw")
if DEBUG_LIMIT:
    df_for_scrape = df_for_scrape.limit(DEBUG_LIMIT)

# collect() materializes every input row as a plain dict in this Python
# process; the scraping below runs in local threads over this list.
to_scrape_rows = [r.asDict() for r in df_for_scrape.collect()]

# ---------------- Scrape one URL ----------------
def scrape_one(site_raw: str, url_raw: str):
    """Scrape a single URL and return a schema-complete row dict.

    The scraper is looked up in SITE_TO_SCRAPER by the normalized site key.
    Rows are always built through _row_with_defaults, with status:
      - "invalid_input" when the site key or URL is empty,
      - "unsupported_site" when no scraper is registered,
      - "exception" when the scraper (or result handling) raises,
      - otherwise the scraper's own status, promoted to "ok" when any price is
        positive and the status is not "exception" / "error" / "http_error".

    ``elapsed_ms`` is measured here around the scraper call; the scraper's own
    value is not used.
    """
    site_key = normalize_site(site_raw)
    identity = {"site_raw": site_raw, "site": site_key, "url": url_raw}

    if not (site_key and url_raw):
        return _row_with_defaults(
            status="invalid_input",
            error=f"missing site/url (site_raw='{site_raw}', url='{url_raw}')",
            scraped_at=_now_utc_iso(),
            **identity,
        )

    scraper = SITE_TO_SCRAPER.get(site_key) if "SITE_TO_SCRAPER" in globals() else None
    if not scraper:
        return _row_with_defaults(
            status="unsupported_site",
            error=f"unsupported site key '{site_key}' (raw='{site_raw}')",
            scraped_at=_now_utc_iso(),
            **identity,
        )

    started = time.time()
    try:
        # Accept either an object exposing .scrape(url) or a plain callable.
        run = scraper.scrape if hasattr(scraper, "scrape") else scraper
        raw = run(url_raw)

        price = parse_price_cop(raw.get("price"))
        promo = parse_price_cop(raw.get("promo_price"))
        member = parse_price_cop(raw.get("membership"))

        status = raw.get("status") or "ok"
        hard_failure = status in ("exception", "error", "http_error")
        found_price = price > 0 or promo > 0 or member > 0
        # Any extracted price counts as success unless the scraper reported an error.
        if found_price and not hard_failure:
            status = "ok"

        return _row_with_defaults(
            status=status,
            error=raw.get("error"),
            http_status=raw.get("http_status"),
            elapsed_ms=int((time.time() - started) * 1000),
            scraped_at=raw.get("scraped_at") or _now_utc_iso(),
            price=price,
            promo_price=promo,
            membership=member,
            has_stock=raw.get("has_stock"),
            stock=raw.get("stock"),
            name_scraped=raw.get("name_scraped"),
            image_url=raw.get("image_url"),
            **identity,
        )

    except Exception as e:
        return _row_with_defaults(
            status="exception",
            error=repr(e),
            elapsed_ms=int((time.time() - started) * 1000),
            scraped_at=_now_utc_iso(),
            **identity,
        )

# ---------------- Group by site ----------------
# by_site maps normalized site key -> list of (site_raw, url_raw) pairs.
# Rows whose site cannot be normalized are grouped under "unknown".
by_site = {}
for r in to_scrape_rows:
    sr = r.get("site_raw")
    u = r.get("url_raw")
    sk = normalize_site(sr)
    by_site.setdefault(sk or "unknown", []).append((sr, u))

sites = sorted(by_site.keys())
print("Sites:", sites)

# ---------------- Parallel execution ----------------
# Never start more site workers than there are sites, and always at least one
# (ThreadPoolExecutor rejects max_workers=0 when the input is empty).
MAX_SITE_WORKERS_EFFECTIVE = min(MAX_SITE_WORKERS, max(1, len(by_site)))

def scrape_site(site_key, pairs):
    """Scrape every (site_raw, url) pair of one site with a local thread pool.

    Results are returned in completion order, not input order. ``site_key`` is
    not used here; each pair carries its own raw site name.
    """
    with ThreadPoolExecutor(max_workers=MAX_URL_WORKERS_PER_SITE) as url_pool:
        pending = [url_pool.submit(scrape_one, raw_site, raw_url) for raw_site, raw_url in pairs]
        return [finished.result() for finished in as_completed(pending)]

# One task per site; each task runs scrape_site, which fans out over that
# site's URLs. Rows are appended as whole-site batches, in completion order.
results = []
with ThreadPoolExecutor(max_workers=MAX_SITE_WORKERS_EFFECTIVE) as site_pool:
    futs = {site_pool.submit(scrape_site, k, v): k for k, v in by_site.items()}
    for f in as_completed(futs):
        results.extend(f.result())

# ---------------- Output (schema-fixed) ----------------
# The explicit schema avoids type inference over the result dicts.
out_spark = spark.createDataFrame(results, schema=OUT_SCHEMA)
display(out_spark)

# ---------------- Summary (ok per commerce) ----------------
# Per-site row counts by status. "ok_but_no_price_rows" flags rows marked ok
# although none of the three price columns is positive.
# NOTE: "http_error" and "no_price" rows are counted in "rows" only; they have
# no dedicated column here.
site_summary = (
    out_spark
    .groupBy("site")
    .agg(
        F.count("*").alias("rows"),
        F.sum(F.when(F.col("status") == "ok", 1).otherwise(0)).alias("ok_rows"),
        F.sum(F.when(F.col("status") == "unsupported_site", 1).otherwise(0)).alias("unsupported_rows"),
        F.sum(F.when(F.col("status") == "exception", 1).otherwise(0)).alias("exception_rows"),
        F.sum(F.when(F.col("status") == "invalid_input", 1).otherwise(0)).alias("invalid_rows"),
        F.sum(
            F.when(
                (F.col("status") == "ok") &
                (F.col("price") <= 0) & (F.col("promo_price") <= 0) & (F.col("membership") <= 0),
                1
            ).otherwise(0)
        ).alias("ok_but_no_price_rows"),
    )
    .withColumn("ok_rate", F.round(F.col("ok_rows") / F.col("rows"), 3))
    .orderBy(F.desc("rows"))
)
display(site_summary)

# ---------------- Error sample ----------------
# Only exception / unsupported_site / invalid_input rows are listed here;
# rows with status "http_error" or "no_price" are not part of this sample.
errors_df = (
    out_spark
    .filter(F.col("status").isin("exception", "unsupported_site", "invalid_input"))
    .select("site", "site_raw", "status", "error", "url")
)
display(errors_df.limit(200))

In [0]:
# 3.5) MercadoLibre "pseudo-scrape" from MAIN_TABLE (main_file)
# Paste BETWEEN the scrape driver cell (creates out_spark) and Cell 10 (schema normalization).

from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType

# This cell rewrites out_spark, so the scrape driver must have produced it already.
if "out_spark" not in globals():
    raise ValueError("out_spark not found. Run the scrape driver cell first (the cell that produces out_spark).")

# --- Load MAIN_TABLE (catalog / manual ML prices source) ---
main_df = spark.table(MAIN_TABLE)
# Case-insensitive lookup: lower-cased column name -> actual column name.
cols_lc = {actual.lower(): actual for actual in main_df.columns}

# Required cols: the URL plus the site column (either "comercio" or "site").
url_col = cols_lc.get("url")
site_col = next((cols_lc[key] for key in ("comercio", "site") if key in cols_lc), None)
if not (url_col and site_col):
    raise ValueError(f"MAIN_TABLE must contain url + comercio/site. Found: {main_df.columns}")

# MercadoLibre price columns: all three must be present in MAIN_TABLE.
ml_price_keys = ("precio_full", "precio_membresia", "precio_dcto")
precio_full_col, precio_membresia_col, precio_dcto_col = (cols_lc.get(key) for key in ml_price_keys)

missing = [key for key in ml_price_keys if key not in cols_lc]
if missing:
    raise ValueError(f"MAIN_TABLE is missing required MercadoLibre price columns: {missing}. Found: {main_df.columns}")

# Optional enrich columns (None when no candidate name is present).
name_col = next((cols_lc[key] for key in ("producto", "name", "nombre") if key in cols_lc), None)
image_col = next((cols_lc[key] for key in ("image_url", "image", "imagen") if key in cols_lc), None)

# Wrap the notebook's Python helpers as Spark UDFs so they can run on columns.
normalize_site_udf = F.udf(normalize_site, StringType())
parse_price_udf = F.udf(parse_price_cop, IntegerType())

# Parse prices for the MercadoLibre rows of MAIN_TABLE.
# The URL is trimmed here, like the catalog loader does for url_raw, so that padded
# values do not create distinct keys for the later (scrape_date, url) de-dup.
parsed = (
    main_df
      .withColumn("site_norm", normalize_site_udf(F.col(site_col).cast("string")))
      .filter(F.col("site_norm") == F.lit("mercadolibre"))
      .withColumn("_ml_url", F.trim(F.col(url_col).cast("string")))
      .withColumn("p_full", parse_price_udf(F.col(precio_full_col)))
      .withColumn("p_memb", parse_price_udf(F.col(precio_membresia_col)))
      .withColumn("p_dcto", parse_price_udf(F.col(precio_dcto_col)))
)

# DROP if regular price is missing or malformed (<=0 or null), or if the URL is
# NULL/blank: a row without a URL cannot be keyed downstream.
parsed_good = parsed.filter(
    F.col("p_full").isNotNull() & (F.col("p_full") > 0)
    & F.col("_ml_url").isNotNull() & (F.length(F.col("_ml_url")) > 0)
)

# Build synthetic rows matching OUT_SCHEMA (from Cell 9)
ml_rows = (
    parsed_good
      .select(
          F.trim(F.col(site_col).cast("string")).alias("site_raw"),
          F.lit("mercadolibre").alias("site"),
          F.col("_ml_url").alias("url"),

          # Map main_file columns to scraper output fields
          F.col("p_full").cast("int").alias("price"),          # -> downstream price_full_cop
          F.coalesce(F.col("p_dcto"), F.lit(0)).cast("int").alias("promo_price"),   # -> downstream price_discount_cop
          F.coalesce(F.col("p_memb"), F.lit(0)).cast("int").alias("membership"),    # -> downstream price_membership_cop

          # Synthetic scrape metadata: these rows come from the manual feed, not from an HTTP request
          F.lit("ok").alias("status"),
          F.lit(None).cast("string").alias("error"),
          F.lit(200).cast("int").alias("http_status"),
          F.lit(0).cast("int").alias("elapsed_ms"),
          F.current_timestamp().cast("string").alias("scraped_at"),

          # Optional enrichments (NULL when MAIN_TABLE has no matching column)
          (F.col(name_col).cast("string") if name_col else F.lit(None).cast("string")).alias("name_scraped"),
          (F.col(image_col).cast("string") if image_col else F.lit(None).cast("string")).alias("image_url"),

          # Unknown for ML manual feed
          F.lit(None).cast("boolean").alias("has_stock"),
          F.lit(None).cast("string").alias("stock"),
      )
      # One row per URL; which duplicate survives is not specified.
      .dropDuplicates(["url"])
)

# Enforce exact OUT_SCHEMA column order
expected_cols = [field.name for field in OUT_SCHEMA.fields]
ml_rows = ml_rows.select(*expected_cols)

# Replace any MercadoLibre rows produced by the scraper driver (unsupported_site/no_price/etc.)
# The comparison is null-safe on purpose: a plain `site != 'mercadolibre'` evaluates to NULL
# when site is NULL, and filter() would silently drop those scraper rows from out_spark.
out_spark = (
    out_spark
      .filter(~F.col("site").eqNullSafe(F.lit("mercadolibre")))
      .unionByName(ml_rows, allowMissingColumns=True)
)

print("✅ MercadoLibre rows injected from main_file into out_spark (dropping malformed/missing regular prices).")
display(out_spark.filter(F.col("site") == "mercadolibre").orderBy(F.col("url")).limit(200))

In [0]:
# 4) TO SPARK + NORMALIZE SCHEMA  (NO PANDAS)
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# The scrape driver cell must have run first: it is the one that defines `out_spark`.
if "out_spark" not in globals():
    raise ValueError("out_spark is not defined. Run the scrape driver cell first (it produces out_spark).")

scrape_spark = out_spark

# Backward compatible columns (downstream expects price_full / price_discount / price_membership).
# The driver emits price / promo_price / membership; add the legacy name only when it is absent.
for legacy_name, driver_name in (
    ("price_full", "price"),
    ("price_discount", "promo_price"),
    ("price_membership", "membership"),
):
    if legacy_name not in scrape_spark.columns:
        scrape_spark = scrape_spark.withColumn(legacy_name, F.col(driver_name).cast("long"))

# Add scrape_date (the snapshot partition key) and the *_cop long columns.
scrape_spark = scrape_spark.withColumn("scrape_date", F.to_date(F.current_timestamp()))
for suffix in ("full", "discount", "membership"):
    scrape_spark = scrape_spark.withColumn(
        f"price_{suffix}_cop", F.col(f"price_{suffix}").cast("long")
    )

# Best price: the smallest positive value among discount and full price, or 0 when neither
# is positive. Non-positive prices become NULL so that least() ignores them.
# NOTE(review): the membership price is not part of this minimum even though this step was
# previously described as covering discount/membership/full - confirm which is intended.
positive_discount = F.when(F.col("price_discount_cop") > 0, F.col("price_discount_cop"))
positive_full = F.when(F.col("price_full_cop") > 0, F.col("price_full_cop"))
scrape_spark = scrape_spark.withColumn(
    "price_cop",
    F.coalesce(F.least(positive_discount, positive_full), F.lit(0).cast("long")),
)

# De-dup within this run to keep one row per (scrape_date, url).
# The most recent scraped_at wins; without that column the ordering is arbitrary.
if "scraped_at" in scrape_spark.columns:
    newest_first = F.col("scraped_at").desc()
else:
    newest_first = F.current_timestamp().desc()
per_day_url = Window.partitionBy("scrape_date", "url").orderBy(newest_first)

scrape_spark = (
    scrape_spark
      .withColumn("_rn", F.row_number().over(per_day_url))
      .filter(F.col("_rn") == 1)
      .drop("_rn")
)

# Preview of the normalized price columns.
preview_columns = [
    "site", "url",
    "price_full_cop", "price_discount_cop", "price_membership_cop", "price_cop",
    "status", "error",
]
display(scrape_spark.select(*preview_columns).limit(200))

In [0]:
# 5) WRITE SCRAPE SNAPSHOT (append)

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def table_exists(full_name: str) -> bool:
    """Return True when ``full_name`` is registered in the Spark catalog.

    The catalog is asked directly instead of probing with ``spark.table`` inside a
    blanket ``except Exception``. The blanket handler reported *any* failure
    (permissions, connectivity, ...) as "table does not exist", which sends the caller
    down its table-creation path. ``spark.table`` can also be lazy (e.g. under Spark
    Connect), in which case it never raises for a missing table.

    Args:
        full_name: Table name, optionally qualified with catalog and schema.

    Returns:
        True if the catalog reports the table (or view) as existing, else False.
        Errors other than "not found" propagate to the caller instead of being
        swallowed.
    """
    return bool(spark.catalog.tableExists(full_name))

def align_schema_to_table(df, table_name: str):
    """
    Return ``df`` with every column that also exists in ``table_name`` cast to the
    type that column has in the existing table.

    This avoids DELTA_FAILED_TO_MERGE_FIELDS on append when a column's type drifted
    between runs (e.g. price went from StringType -> LongType).
    Columns present on only one side are left untouched; column order is preserved.
    """
    table_types = {fld.name: fld.dataType for fld in spark.table(table_name).schema}
    shared_columns = [name for name in df.columns if name in table_types]

    aligned = df
    for name in shared_columns:
        # withColumn on an existing name replaces it in place, keeping its position.
        aligned = aligned.withColumn(name, F.col(name).cast(table_types[name]))
    return aligned

if not table_exists(SCRAPE_TABLE):
    # First run: create the table partitioned by scrape_date.
    # 'errorifexists' rather than 'overwrite': if the existence check is ever wrong,
    # the write fails loudly instead of replacing the accumulated snapshot history.
    (scrape_spark
      .write.format('delta')
      .mode('errorifexists')
      .partitionBy('scrape_date')
      .saveAsTable(SCRAPE_TABLE))
    print(f'Created {SCRAPE_TABLE}')
else:
    # Cast shared columns to the table's types first so the append does not fail on
    # type drift; mergeSchema then lets genuinely new columns be added to the table.
    # NOTE(review): append does not remove rows already stored for the same scrape_date,
    # so re-running on the same day adds another row per url - confirm this is intended.
    aligned = align_schema_to_table(scrape_spark, SCRAPE_TABLE)
    (aligned
      .write.format('delta')
      .mode('append')
      .option('mergeSchema', 'true')
      .saveAsTable(SCRAPE_TABLE))
    print(f'Appended to {SCRAPE_TABLE}')

# Quick peek at the most recent snapshot rows
display(spark.table(SCRAPE_TABLE).orderBy(F.col('scrape_date').desc(), F.col('site')).limit(200))