# SinIntermediarios — Build Silver (Dashboard)

**Purpose**
- Read the product catalog (`MAIN_TABLE`).
- Read the daily scrape snapshot (`SCRAPE_TABLE`) for a chosen `TARGET_SCRAPE_DATE` (defaults to latest available).
- Join + standardize schema.
- Write:
  - `SILVER_HISTORY_TABLE` (MERGE/upsert by (scrape_date, url))
  - `SILVER_DASHBOARD_TABLE` (overwrite snapshot for dashboards)

In [0]:
%pip install beautifulsoup4 lxml
dbutils.library.restartPython()

In [0]:
# 1) CONFIG
from pyspark.sql import functions as F

# Source tables: the product catalog and the daily scrape snapshots.
MAIN_TABLE  = "workspace.sinintermediarios.main_file"
SCRAPE_TABLE = "workspace.sinintermediarios.bronze_scrape_daily"

# Output tables: history is upserted by (scrape_date, url); the dashboard
# table is overwritten with the current snapshot on every run.
SILVER_HISTORY_TABLE = "workspace.sinintermediarios.silver_all_history"
SILVER_DASHBOARD_TABLE = "workspace.sinintermediarios.silver_dashboard_latest"

# Any falsy value (None, "") means "use the max scrape_date found in SCRAPE_TABLE".
TARGET_SCRAPE_DATE = None  # 'YYYY-MM-DD' to force, else latest in SCRAPE_TABLE

In [0]:
# Databricks notebook source
# --------------------------------------------------------------------------------------
# Helpers: HTTP, parsing, normalization
# --------------------------------------------------------------------------------------
import datetime
import json
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, Optional
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

# Browser-like request headers applied to the HttpClient session when the
# caller does not supply its own headers.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
}


def soup(html: str) -> BeautifulSoup:
    """Parse *html* with the ``lxml`` parser; a falsy value is parsed as an empty document."""
    markup = html if html else ""
    return BeautifulSoup(markup, "lxml")


@dataclass
class FetchResult:
    """Outcome of a single HTTP GET as produced by ``HttpClient.get``.

    The ``text`` and ``status_code`` properties are read-only aliases of
    ``html`` and ``http_status``.
    """

    status: str        # "ok" or "exception" as set by HttpClient.get
    error: str         # "<ExceptionType>: <message>" on failure, "" otherwise
    http_status: int   # HTTP status code; 0 when the request raised
    elapsed_ms: int    # request duration in milliseconds
    html: str          # response body text; "" when the request raised

    @property
    def text(self) -> str:
        """Alias for ``html``."""
        return self.html

    @property
    def status_code(self) -> int:
        """Alias for ``http_status``."""
        return self.http_status


class HttpClient:
    """Tiny wrapper to keep scraper code consistent and add minimal resiliency.

    Holds one ``requests.Session`` so headers are set once and reused by
    every ``get`` call made through this client.
    """

    def __init__(self, headers: Optional[dict] = None):
        """Create the session; falls back to ``DEFAULT_HEADERS`` when *headers* is falsy."""
        self.session = requests.Session()
        self.session.headers.update(headers or DEFAULT_HEADERS)

    def get(self, url: str, timeout_s: int = 25) -> FetchResult:
        """GET *url* (following redirects) and never raise.

        Args:
            url: Address to fetch.
            timeout_s: Timeout in seconds handed to ``requests``.

        Returns:
            A ``FetchResult`` with ``status="ok"`` whenever a response was
            received (the HTTP code is reported in ``http_status`` and is not
            interpreted here), or ``status="exception"`` with the error text
            and ``http_status=0`` when the request raised.
        """
        # Monotonic clock: wall-clock adjustments (NTP, DST) cannot make the
        # measured duration negative or wildly wrong.
        t0 = time.monotonic()
        try:
            r = self.session.get(url, timeout=timeout_s, allow_redirects=True)
            return FetchResult(
                status="ok",
                error="",
                http_status=int(r.status_code),
                elapsed_ms=int((time.monotonic() - t0) * 1000),
                html=r.text or "",
            )
        except Exception as e:
            # Deliberate best-effort: any failure is reported in the result
            # rather than propagated to the caller.
            return FetchResult(
                status="exception",
                error=f"{type(e).__name__}: {e}",
                http_status=0,
                elapsed_ms=int((time.monotonic() - t0) * 1000),
                html="",
            )


# Module-level client instance; built without arguments, so its session
# carries DEFAULT_HEADERS.
HTTP = HttpClient()


def default_result(site: Optional[str] = None, url: Optional[str] = None) -> Dict[str, Any]:
    """Build a fresh result record in the canonical shape shared by ALL scrapers.

    Args:
        site: Site identifier stored under ``"site"``.
        url: Product URL stored under ``"url"``.

    Returns:
        A new dict with ``status="ok"``, the three raw price fields set to
        ``0.0`` (floats, normalized later) and every other field ``None``.
    """
    record: Dict[str, Any] = {"site": site, "url": url, "status": "ok"}

    # Fetch diagnostics, filled in by the scraper.
    for field in ("error", "http_status", "elapsed_ms", "scraped_at"):
        record[field] = None

    # Raw prices: regular / single purchase, discounted, subscription / membership.
    for field in ("price", "promo_price", "membership"):
        record[field] = 0.0

    # Availability and descriptive attributes.
    for field in ("has_stock", "stock", "name_scraped", "image_url"):
        record[field] = None

    return record


def extract_og_image_url(sp: BeautifulSoup) -> str:
    """Return the first usable social-preview image URL from the page's meta tags.

    Open Graph selectors are tried before Twitter-card ones. A tag whose
    ``content`` is missing, empty, or whitespace-only is skipped so that a
    later selector can still supply a URL.

    Args:
        sp: Parsed document to search.

    Returns:
        The stripped URL (protocol-relative ``//host/...`` values get an
        ``https:`` prefix), or ``""`` when no tag provides one.
    """
    selectors = (
        'meta[property="og:image"]',
        'meta[name="og:image"]',
        'meta[name="twitter:image"]',
        'meta[property="twitter:image"]',
    )
    for sel in selectors:
        tag = sp.select_one(sel)
        content = tag.get("content") if tag else None
        if not content:
            continue
        u = content.strip()
        if not u:
            # Whitespace-only content: keep looking instead of returning "".
            continue
        if u.startswith("//"):
            u = "https:" + u
        return u
    return ""


def normalize_site(site_raw: Optional[str]) -> str:
    """Map a free-form merchant name to its canonical lowercase key.

    The input is stringified, trimmed and lower-cased; ``_`` and ``-`` are
    treated as spaces and whitespace runs are collapsed. Known names are
    looked up in a fixed table; anything else has its spaces removed.

    Args:
        site_raw: Raw merchant name, or ``None``.

    Returns:
        The canonical key, or ``""`` when *site_raw* is ``None``.
    """
    if site_raw is None:
        return ""

    separators_to_space = str.maketrans("_-", "  ")
    key = str(site_raw).strip().lower().translate(separators_to_space)
    key = re.sub(r"\s+", " ", key).strip()

    canonical = {
        "savvy": "savvy",
        "mercado libre": "mercadolibre",
        "proscience": "proscience",
        "zona fit": "zonafit",
        "sin intermediarios": "sinintermediarios",
        "vitanas": "vitanas",
        "farmatodo": "farmatodo",
        "herbivore": "herbivore",
        "nutramerican": "nutramerican",
        "colsubsidio": "colsubsidio",
        "muscletech": "muscletech",
    }
    if key in canonical:
        return canonical[key]
    # Unknown merchant: fall back to the squeezed form.
    return key.replace(" ", "")


def clean_url(url: Optional[str]) -> str:
    """Return *url* trimmed and without its ``#fragment``.

    ``None`` and the placeholder strings ``"nan"`` / ``"none"`` / ``""``
    (case-insensitive, after trimming) all yield ``""``.
    """
    if url is None:
        return ""

    text = str(url).strip()
    if text.lower() in {"nan", "none", ""}:
        return ""

    # Everything from the first "#" onwards is the fragment.
    without_fragment, _, _ = text.partition("#")
    return without_fragment.strip()


def parse_price_any(val: Any) -> float:
    """Parse common price strings into a float.

    Handles:
      - numbers (int/float), returned as ``float(val)``
      - COP strings with thousand separators: "$174.100" / "1.250.000" /
        "$195,900" / "174.100,00"
      - Shopify JSON strings: "164990.00"

    Anything other than digits, ``,`` and ``.`` is discarded before parsing,
    so currency symbols, spaces and signs are ignored.

    Args:
        val: Raw price value of any type.

    Returns:
        The parsed amount with any two-digit cents part truncated, or ``0.0``
        when *val* is ``None``, empty, or cannot be parsed.
    """
    if val is None:
        return 0.0
    if isinstance(val, (int, float)):
        try:
            return float(val)
        except (OverflowError, TypeError, ValueError):
            # e.g. an int too large to be represented as a float
            return 0.0

    s = str(val).strip()
    if s == "":
        return 0.0

    # Keep only digits + separators
    s2 = re.sub(r"[^0-9,.]", "", s)
    if s2 == "":
        return 0.0

    # A separator followed by exactly two trailing digits is read as cents,
    # e.g. "174.100,00" or "164990.00" (but not "174.100" or "195,900").
    has_cents = bool(re.search(r"[,.]\d{2}$", s2))

    if "," in s2 and "." in s2:
        # Both present: the separator occurring last is the decimal one.
        if s2.rfind(",") > s2.rfind("."):
            # "." thousand, "," decimal
            s2 = s2.replace(".", "").replace(",", ".")
        else:
            # "," thousand, "." decimal
            s2 = s2.replace(",", "")
    elif "," in s2:
        # Comma only: decimal when it ends with two digits, else thousands.
        if has_cents:
            s2 = s2.replace(",", ".")
        else:
            s2 = s2.replace(",", "")
    elif re.fullmatch(r"\d{1,3}(?:\.\d{3})+", s2):
        # Dot only, in strict 3-digit groups ("174.100", "1.250.000"): the
        # dots are thousand separators. Without this branch "174.100" parsed
        # as 174.1 and "1.250.000" failed to parse at all.
        s2 = s2.replace(".", "")
    # else: a single decimal '.' or no separator -> float() handles it

    try:
        x = float(s2)
    except ValueError:
        return 0.0

    # If cents were explicitly present, convert to pesos by flooring cents.
    # (For COP, cents are not used in practice; Shopify may emit ".00".)
    if has_cents:
        x = float(int(x))

    return x


def cop_thousands_fix(x: float) -> float:
    """Scale a suspiciously small COP amount up by 1000.

    Heuristic: COP prices are rarely strictly between 1 and 1000; such a
    value usually means the thousands were lost. Falsy input counts as
    ``0.0`` and anything that cannot be converted to float yields ``0.0``.
    """
    try:
        amount = float(x if x else 0.0)
    except Exception:
        return 0.0
    return amount * 1000 if 1 < amount < 1000 else amount

In [0]:
# 2) LOAD MAIN + SCRAPE (target date)
from pyspark.sql import functions as F
from pyspark.sql.window import Window

main_df = spark.table(MAIN_TABLE)
scrape_df_all = spark.table(SCRAPE_TABLE)

# An explicit TARGET_SCRAPE_DATE wins; otherwise use the latest scrape_date present.
target_date = (
    TARGET_SCRAPE_DATE
    or scrape_df_all.select(F.max('scrape_date').alias('d')).collect()[0]['d']
)

print('Using scrape_date:', target_date)

scrape_df = scrape_df_all.filter(F.col('scrape_date') == F.lit(target_date))

# De-dup to one row per url. Rows are ranked by status first
# (ok=3 > no_price=2 > anything else=1), then by most recent scraped_at.
status_rank = (
    F.when(F.col('status') == 'ok', F.lit(3))
     .when(F.col('status') == 'no_price', F.lit(2))
     .otherwise(F.lit(1))
)

w = Window.partitionBy('url').orderBy(
    status_rank.desc(),
    F.to_timestamp('scraped_at').desc_nulls_last(),
)
scrape_df = scrape_df.withColumn('_rn', F.row_number().over(w))
scrape_df = scrape_df.filter(F.col('_rn') == 1).drop('_rn')

print('Main rows:', main_df.count())
print('Scrape rows (dedup):', scrape_df.count())

In [0]:
# 3) JOIN + CLEAN  (FIX: avoid ambiguous 'site' column)
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# --- resolve main columns case-insensitively ---
cols_lc = {c.lower(): c for c in main_df.columns}

url_col = cols_lc.get("url")
# The merchant name lives in 'comercio'; a 'site' column is the fallback.
site_raw_col = cols_lc.get("comercio") or cols_lc.get("site")

if not url_col:
    raise ValueError(f"MAIN_TABLE must contain 'url'. Found: {main_df.columns}")

if not site_raw_col:
    # Typed as string: an untyped NULL literal would be a NullType (void)
    # column, which is carried through meta_cols into the table writes.
    main_df = main_df.withColumn("_site_raw_fallback", F.lit(None).cast(StringType()))
    site_raw_col = "_site_raw_fallback"

# --- avoid name collision with the output column 'site' ---
# Rename main's 'site' column whatever its casing (column resolution is
# case-insensitive by default, so 'Site' would collide too), and keep
# site_raw_col pointing at the renamed column when it was the one selected.
main_site_col = cols_lc.get("site")
if main_site_col:
    main_df = main_df.withColumnRenamed(main_site_col, "site_main")
    if site_raw_col == main_site_col:
        site_raw_col = "site_main"

m = main_df.alias("m")
s = scrape_df.alias("s")

# --- left join on trimmed URL: every catalog row is kept, scraped or not ---
joined = (
    m.join(
        s,
        on=(F.trim(F.col(f"m.{url_col}")) == F.trim(F.col("s.url"))),
        how="left",
    )
    .withColumn("site_raw", F.trim(F.col(f"m.{site_raw_col}")))
)

# --- normalized site: prefer scrape site, else normalize from site_raw ---
normalize_site_udf = F.udf(normalize_site, StringType())

joined = joined.withColumn(
    "site_norm",
    F.coalesce(F.col("s.site"), normalize_site_udf(F.col("site_raw")))
)

# --- keep all original product metadata (including comercio, marca, gramos_empaque, etc.) ---
# Only the url column is left out of the metadata: it is emitted separately, trimmed, as 'url'.
meta_cols = [c for c in main_df.columns if c != url_col]

clean_df = joined.select(
    F.col("site_raw"),
    F.col("site_norm").alias("site"),
    F.trim(F.col(f"m.{url_col}")).alias("url"),

    # scrape outcome (all NULL for catalog rows with no matching scrape row)
    F.col("s.status").alias("status"),
    F.col("s.error").alias("error"),
    F.col("s.http_status").alias("http_status"),
    F.col("s.elapsed_ms").alias("elapsed_ms"),
    F.col("s.scraped_at").alias("scraped_at"),
    F.col("s.scrape_date").alias("scrape_date"),

    F.col("s.price_full_cop").alias("price_full_cop"),
    F.col("s.price_discount_cop").alias("price_discount_cop"),
    F.col("s.price_membership_cop").alias("price_membership_cop"),
    F.col("s.price_cop").alias("price_cop"),

    F.col("s.has_stock").alias("has_stock"),
    F.col("s.stock").alias("stock"),
    F.col("s.image_url").alias("image_url"),
    F.col("s.name_scraped").alias("name_scraped"),

    *[F.col(f"m.{c}").alias(c) for c in meta_cols if c not in ["site_raw", "site_norm"]]
)

display(clean_df.limit(200))

In [0]:
# 4) WRITE SILVER TABLES (snapshot + history create/merge)
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def table_exists(full_name: str) -> bool:
    """Return True when ``DESCRIBE TABLE <full_name>`` succeeds, False on any error.

    Probing with DESCRIBE is used as the most reliable check across
    DBR / Spark Connect / UC.
    """
    try:
        spark.sql(f"DESCRIBE TABLE {full_name}")
    except Exception:
        return False
    return True

print("SILVER_DASHBOARD_TABLE =", SILVER_DASHBOARD_TABLE)
print("SILVER_HISTORY_TABLE   =", SILVER_HISTORY_TABLE)

# -------------------------
# 0) Defensive de-dupe: 1 row per (scrape_date, url)
# -------------------------
# Newest scraped_at wins; if that column is absent the ordering is arbitrary.
if "scraped_at" in clean_df.columns:
    order_col = F.col("scraped_at").desc()
else:
    order_col = F.current_timestamp().desc()

w = Window.partitionBy("scrape_date", "url").orderBy(order_col)

clean_hist = clean_df.withColumn("_rn", F.row_number().over(w))
clean_hist = clean_hist.filter(F.col("_rn") == 1).drop("_rn")

# -------------------------
# 4a) Snapshot (overwrite)
# -------------------------
# Full replace of the dashboard table, schema included, from clean_df.
snapshot_writer = (
    clean_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
snapshot_writer.saveAsTable(SILVER_DASHBOARD_TABLE)
print("Wrote dashboard snapshot:", SILVER_DASHBOARD_TABLE)

# -------------------------
# 4b) History (create if missing, else schema-evolving upsert)
# -------------------------
if not table_exists(SILVER_HISTORY_TABLE):
    # First run: create the table from the de-duplicated rows.
    (
        clean_hist
          .write.format("delta")
          .mode("overwrite")
          .partitionBy("scrape_date")
          .saveAsTable(SILVER_HISTORY_TABLE)
    )
    print("Created history table:", SILVER_HISTORY_TABLE)

else:
    tgt = spark.table(SILVER_HISTORY_TABLE)
    tgt_cols = tgt.columns

    # 1) EVOLVE TARGET SCHEMA: add any new columns from source into target table
    src_schema = {f.name: f.dataType.simpleString() for f in clean_hist.schema.fields}
    new_cols = [c for c in src_schema if c not in tgt_cols]

    if new_cols:
        add_cols_ddl = ", ".join([f"`{c}` {src_schema[c]}" for c in new_cols])
        spark.sql(f"ALTER TABLE {SILVER_HISTORY_TABLE} ADD COLUMNS ({add_cols_ddl})")
        # Re-read so the column list and schema include the new columns.
        tgt = spark.table(SILVER_HISTORY_TABLE)
        tgt_cols = tgt.columns

    # 2) Align source to updated target schema (add missing cols as null, order cols)
    src_aligned = clean_hist
    tgt_schema = tgt.schema

    for c in tgt_cols:
        if c not in src_aligned.columns:
            src_aligned = src_aligned.withColumn(c, F.lit(None).cast(tgt_schema[c].dataType))

    src_aligned = src_aligned.select(*tgt_cols)
    src_aligned.createOrReplaceTempView("src_aligned")

    update_set = ",\n      ".join([f"t.`{c}` = s.`{c}`" for c in tgt_cols if c not in ["scrape_date", "url"]])
    insert_cols = ", ".join([f"`{c}`" for c in tgt_cols])
    insert_vals = ", ".join([f"s.`{c}`" for c in tgt_cols])

    # 3) Upsert by (scrape_date, url).
    # The keys are compared with the null-safe operator `<=>`: catalog rows
    # that had no matching scrape row carry a NULL scrape_date, and with a
    # plain `=` they never match the target, so each run inserted them again
    # as duplicates instead of updating the existing row.
    spark.sql(f"""
      MERGE INTO {SILVER_HISTORY_TABLE} AS t
      USING src_aligned AS s
      ON t.scrape_date <=> s.scrape_date AND t.url <=> s.url
      WHEN MATCHED THEN UPDATE SET
        {update_set}
      WHEN NOT MATCHED THEN INSERT ({insert_cols})
      VALUES ({insert_vals})
    """)
    print("Upserted history table:", SILVER_HISTORY_TABLE)

# -------------------------
# Peek
# -------------------------
# Newest scrape dates first, then by site with NULL sites last; capped at 200 rows.
history_preview = (
    spark.table(SILVER_HISTORY_TABLE)
    .orderBy(F.col("scrape_date").desc(), F.col("site").asc_nulls_last())
    .limit(200)
)
display(history_preview)