In [5]:
# -*- coding: utf-8 -*-
import sys, os, re, json, time, unicodedata, io
from gzip import BadGzipFile
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse, parse_qs, urlunparse

import requests
import pandas as pd
from bs4 import BeautifulSoup

# ===== project dictionaries / rules =====
# NOTE(review): absolute, machine-specific path; the local modules imported
# below are only importable when this directory exists on the host.
sys.path.append(os.path.abspath("/home/usuario/Área de trabalho/Dados/models"))
from skin import SKIN_TYPE_CANONICAL_ORDER, SKIN_TYPE_SYNONYMS_PT
from exclude import EXCLUDE_KEYWORDS
from ingredient import INGREDIENTES_VALIDOS
from benefits import BENEFIT_SYNONYMS_PT, BENEFIT_CANONICAL_ORDER
from category import CATEGORY_CANONICAL_ORDER, CATEGORY_HINTS

# ================== Config ==================
# Site root, used to absolutize root-relative links and image URLs.
BASE_URL = "https://www.farmax.com.br"
# Output files written by main().
JSON_PATH = Path("farmax_products.json")
CSV_PATH  = Path("farmax_products.csv")

# Directory that receives downloaded product images; created at import time.
IMG_DIR = Path("./images/farmax")
IMG_DIR.mkdir(parents=True, exist_ok=True)

# One shared HTTP session with browser-like default headers.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
})

# Non-breaking space, replaced by a regular space during normalization.
NBSP = "\xa0"
# Brazilian currency amount such as "R$ 1.234,56"; group 1 is the number.
CURRENCY_RE = re.compile(r"R\$\s*([\d\.]+,\d{2})")
# Quantity such as "120g" or "1,5 l"; group 1 is the value, group 2 the unit.
QTY_RE      = re.compile(r"(\d+(?:[.,]\d+)?)\s*(ml|g|l|kg|un|unid|unidades)\b", re.I)

# Category listing pages that main() crawls for product links.
CATEGORY_URLS: List[str] = [
    "https://www.farmax.com.br/skin-care/limpeza",
    "https://www.farmax.com.br/skin-care/hidratantes",
    "https://www.farmax.com.br/skin-care/seruns-de-tratamento",
    "https://www.farmax.com.br/skin-care/protecao-solar",
    "https://www.farmax.com.br/skin-care/mascaras-faciais",
    "https://www.farmax.com.br/skin-care/esfoliantes",
    "https://www.farmax.com.br/skin-care/agua-termal",
    "https://www.farmax.com.br/skin-care/labiais",
]

# Extra product pages to process in addition to the crawled ones.
PDP_URLS: List[str] = [
    # e.g. "https://www.farmax.com.br/protetor-solar-toque-seco-sunless-fps60-120g/p"
]

# ================== Helpers ==================
def _strip(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

def _norm(s: str) -> str:
    """Return an accent-free, lower-case, single-spaced form of *s* for matching.

    Non-breaking spaces become regular spaces, the text is NFKD-decomposed and
    its combining marks are dropped, and every run of whitespace, ``_``, ``/``,
    ``,`` or ``-`` is replaced by one space. A falsy *s* yields ``""``.
    """
    text = (s or "").replace(NBSP, " ")
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    spaced = re.sub(r"[\s_/,-]+", " ", "".join(base_chars))
    return spaced.strip().lower()

def sanitize_filename(s: str) -> str:
    """Turn *s* into a slug usable as a file name.

    The text is normalized with ``_norm``; every run of characters outside
    ``a-z0-9._-`` becomes a single ``-`` and leading/trailing dashes are
    removed. An empty result falls back to ``"img"``.
    """
    slug = re.sub(r"[^a-z0-9\._-]+", "-", _norm(s))
    slug = re.sub(r"-{2,}", "-", slug).strip("-")
    if not slug:
        return "img"
    return slug

def br_money_to_float_str(text: str) -> Optional[str]:
    """Extract the first ``R$`` amount in *text* as a ``"1234.56"`` string.

    Thousands dots are removed and the decimal comma becomes a dot. Returns
    ``None`` when *text* is empty, holds no amount, or the amount cannot be
    parsed as a float.
    """
    match = CURRENCY_RE.search(text) if text else None
    if match is None:
        return None
    digits = match.group(1).replace(".", "").replace(",", ".")
    try:
        amount = float(digits)
    except ValueError:
        return None
    return f"{amount:.2f}"

def extract_quantity(text: str) -> Optional[str]:
    """Return the first quantity found in *text*, e.g. ``"120g"`` or ``"3 un"``.

    The decimal comma becomes a dot and the unit is lower-cased; ``unid`` and
    ``unidades`` are folded into ``un``. Measurement units (ml, g, l, kg) are
    glued to the value, any other unit is separated by one space. Returns
    ``None`` when *text* is empty or no quantity matches.
    """
    match = QTY_RE.search(text) if text else None
    if match is None:
        return None
    amount = match.group(1).replace(",", ".")
    unit = match.group(2).lower()
    if unit in ("unid", "unidades"):
        unit = "un"
    separator = "" if unit in {"ml", "g", "l", "kg"} else " "
    return f"{amount}{separator}{unit}"

def should_exclude(name: str) -> bool:
    """Return True when the product *name* should be skipped.

    Two rule sets are applied to the normalized name:

    * every entry of ``EXCLUDE_KEYWORDS`` is matched as a substring, exactly
      as before;
    * the generic bundle/gift terms listed below are matched as whole words
      (with an optional plural ``s``). Matching them as substrings made
      "duo" exclude any name containing "resíduo", for example.
    """
    normalized = _norm(name)

    # Project-level keywords keep their original substring semantics.
    if any(_norm(kw) in normalized for kw in EXCLUDE_KEYWORDS):
        return True

    generic_terms = (
        "kit", "refil", "refill", "combo", "duo", "trio",
        "necessaire", "presente", "gift",
    )
    alternatives = "|".join(re.escape(_norm(term)) for term in generic_terms)
    # _norm already turned hyphens/slashes into spaces, so \b is reliable here.
    return re.search(rf"\b(?:{alternatives})s?\b", normalized) is not None

def get_soup(url: str) -> Optional[BeautifulSoup]:
    """Fetch *url* with the shared session and parse it with the lxml parser.

    Up to three attempts are made, waiting 1s and then 2s between them.
    Network errors and non-200 responses trigger a retry, except 404/410,
    which cannot succeed on retry and return ``None`` immediately.

    Returns:
        The parsed document, or ``None`` when no attempt produced HTTP 200.
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            response = SESSION.get(url, timeout=30)
        except requests.RequestException:
            response = None  # best-effort scraper: treat as a failed attempt

        if response is not None:
            if response.status_code == 200:
                return BeautifulSoup(response.text, "lxml")
            if response.status_code in (404, 410):
                return None

        # Back off only when another attempt follows.
        if attempt < attempts - 1:
            time.sleep(1 + attempt)
    return None

def collect_product_links_from_category(url: str) -> List[str]:
    """Return the sorted, de-duplicated product-page links found on a category page.

    Protocol-relative (``//``) and root-relative (``/``) hrefs are made
    absolute; a link counts as a product page when it contains ``/p`` followed
    by ``?`` or the end of the string. Returns ``[]`` when the page cannot be
    fetched.
    """
    soup = get_soup(url)
    if not soup:
        return []

    def _absolute(href: str) -> str:
        if href.startswith("//"):
            return "https:" + href
        if href.startswith("/"):
            return urljoin(BASE_URL, href)
        return href

    anchors = soup.select("a.vtex-product-summary-2-x-clearLink, a")
    raw_hrefs = (anchor.get("href") or "" for anchor in anchors)
    absolute_links = {_absolute(href) for href in raw_hrefs if href}
    return sorted(link for link in absolute_links if re.search(r"/p(\?|$)", link))

# ================== PDP fields ==================
def extract_name_subtitle_qty_farmax(soup: BeautifulSoup) -> Tuple[str, Optional[str], Optional[str]]:
    """Return ``(name, subtitle, quantity)`` for a product page.

    The name is the text of the product-brand span (``""`` when the span is
    missing), the subtitle is always ``None``, and the quantity is parsed out
    of the name.
    """
    node = soup.select_one("span.vtex-store-components-3-x-productBrand, span[class*='vtex-store-components-3-x-productBrand']")
    name = _strip(node.get_text()) if node else ""
    return name, None, extract_quantity(name)

def extract_price_str_farmax(soup: BeautifulSoup) -> Optional[str]:
    """Return the product price as a ``"12.34"`` string, or ``None``.

    The dedicated ``span.locateoffer__price`` elements are consulted first
    (presumably the page's own price element; confirm against the live site).
    Only when they yield nothing is every ``span``/``div``/``p`` scanned.
    Scanning everything up front made the minimum pick up unrelated amounts
    such as installment values or free-shipping thresholds. In both passes
    the lowest amount found wins.
    """
    def _prices(selector: str) -> set:
        # Distinct amounts found in the elements matched by *selector*.
        found = set()
        for element in soup.select(selector):
            value = br_money_to_float_str(element.get_text(" ", strip=True))
            if value:
                found.add(value)
        return found

    prices = _prices("span.locateoffer__price") or _prices("span, div, p")
    return min(prices, key=float) if prices else None

def extract_skin_text_farmax(soup: BeautifulSoup) -> str:
    """Return the longest whitespace-normalized text among the specification tabs.

    Yields ``""`` when no specification tab exists or all of them are empty;
    on a tie the first tab in document order wins.
    """
    tabs = soup.select("div.vtex-store-components-3-x-specificationsTab")
    texts = (_strip(tab.get_text(" ", strip=True)) for tab in tabs)
    return max(texts, key=len, default="")

def map_skin_types_from_text(text: str) -> List[str]:
    """Map free text to canonical skin types, in canonical order.

    Text mentioning "todos os tipos de pele" maps straight to
    ``["todos os tipos"]``. Otherwise a canonical type is selected when its
    name or any synonym occurs (after normalization) in the text; when
    nothing matches, ``["todos os tipos"]`` is the default.
    """
    haystack = _norm(text)
    if "todos os tipos de pele" in haystack:
        return ["todos os tipos"]

    matched = {
        canonical
        for canonical, syns in SKIN_TYPE_SYNONYMS_PT.items()
        if any(_norm(term) in haystack for term in syns + [canonical])
    }
    ordered = [skin for skin in SKIN_TYPE_CANONICAL_ORDER if skin in matched]
    if ordered:
        return ordered
    return ["todos os tipos"]

def extract_benefits_farmax(soup: BeautifulSoup) -> List[str]:
    """Return the canonical benefits mentioned on the page, in canonical order.

    Candidate snippets come from two places: ``li``/``p`` elements whose text
    starts with a bullet or dash (3 to 240 characters once the marker is
    removed), and sentences of the specification text containing one of a
    fixed list of keywords. A benefit is reported when its canonical name or
    one of its synonyms appears as a whole word in the normalized snippets.
    """
    bullet = re.compile(r"^[•\-\u2022]\s*")
    snippets = []
    for node in soup.select("li, p"):
        line = _strip(node.get_text(" ", strip=True))
        if not bullet.match(line):
            continue
        body = bullet.sub("", line).strip()
        if 3 <= len(body) <= 240:
            snippets.append(body)

    spec = extract_skin_text_farmax(soup)
    if spec:
        keywords = ["hidrata", "toque seco", "prote", "macia", "revitaliz", "antissinais", "uniformiza", "oleos", "matte", "uva", "uvb"]
        for sentence in re.split(r"[.;]\s+", spec):
            cleaned = _strip(sentence)
            folded = _norm(cleaned)
            if any(keyword in folded for keyword in keywords):
                snippets.append(cleaned)

    haystack = " " + " ; ".join(_norm(snippet) for snippet in snippets) + " "

    def _mentioned(term: str) -> bool:
        return re.search(rf"\b{re.escape(_norm(term))}\b", haystack) is not None

    return [
        canonical
        for canonical in BENEFIT_CANONICAL_ORDER
        if any(_mentioned(term) for term in BENEFIT_SYNONYMS_PT.get(canonical, []) + [canonical])
    ]

def extract_ingredients_text_farmax(soup: BeautifulSoup) -> str:
    """Return the raw ingredient-list text of a product page, or ``""``.

    Strategy 1: find an element whose whole text is "Ingredientes" or
    "Composição", walk up to 12 following siblings, and return the longest
    text block (more than 20 characters) found in them. A sibling that is
    itself a ``span``/``p``/``div`` with no such descendants contributes its
    own text; previously only descendants were inspected, so a layout like
    ``<h3>Ingredientes</h3><p>Aqua, ...</p>`` was missed.

    Strategy 2 (fallback): the longest ``span``/``p``/``div`` text that holds
    at least six commas or a slash.
    """
    text_tags = ("span", "p", "div")
    # _norm strips accents, so "composição" is compared as "composicao".
    heading_labels = {"ingredientes", "composicao"}

    for tag in soup.find_all(["button", "div", "span", "p", "h2", "h3", "h4"]):
        if _norm(_strip(tag.get_text())) not in heading_labels:
            continue
        chunks = []
        sib = tag.find_next_sibling()
        steps = 0
        while sib and steps < 12:
            nodes = sib.select("span, p, div")
            if not nodes and sib.name in text_tags:
                nodes = [sib]  # the sibling itself is the text container
            for node in nodes:
                text = _strip(node.get_text(" ", strip=True))
                if len(text) > 20:
                    chunks.append(text)
            sib = sib.find_next_sibling()
            steps += 1
        if chunks:
            return max(chunks, key=len)

    # NOTE(review): outer containers include the text of everything nested in
    # them, so this fallback tends to return a very large block; callers filter
    # the result against the ingredient whitelist afterwards.
    candidate = ""
    for element in soup.select("span, p, div"):
        text = _strip(element.get_text(" ", strip=True))
        if (text.count(",") >= 6 or "/" in text) and len(text) > len(candidate):
            candidate = text
    return candidate

def tokenize_and_filter_ingredients(raw_text: str) -> List[str]:
    """Split *raw_text* into ingredient tokens and keep the whitelisted ones.

    The text is split on ``,`` and ``/``. Each token is normalized and matched
    against ``INGREDIENTES_VALIDOS``: first exactly, then by substring
    containment in either direction. Containment requires both the token and
    the whitelist entry to have at least four normalized characters. Before,
    only the whitelist side was checked, so an empty or one-letter token
    (e.g. a stray "-" or "e") matched an arbitrary whitelist entry.

    Returns:
        Canonical ingredient names in order of first appearance, without
        duplicates.
    """
    if not raw_text:
        return []

    min_fuzzy_len = 4
    valid_norm_map = {_norm(v): v for v in INGREDIENTES_VALIDOS}

    out: List[str] = []
    seen = set()
    for piece in re.split(r"[,/]", raw_text):
        tok = _strip(piece).strip(".")
        if not tok:
            continue
        key = _norm(tok)
        if not key:
            continue  # token was punctuation only

        matched = valid_norm_map.get(key)
        if matched is None and len(key) >= min_fuzzy_len:
            matched = next(
                (
                    canon
                    for k_valid, canon in valid_norm_map.items()
                    if len(k_valid) >= min_fuzzy_len and (k_valid in key or key in k_valid)
                ),
                None,
            )

        if matched:
            dedup_key = _norm(matched)
            if dedup_key not in seen:
                seen.add(dedup_key)
                out.append(matched)
    return out

def classify_category(name: str, subtitle: Optional[str]) -> Optional[str]:
    """Pick a category for a product from its name and optional subtitle.

    A category is a hit when any of its hints occurs in the normalized text.
    With several hits, the first one listed in ``CATEGORY_CANONICAL_ORDER``
    wins; if none of the hits is listed there, the first hit is returned.
    Returns ``None`` when nothing matches.
    """
    haystack = _norm(f"{name} {subtitle or ''}")
    hits = [
        cat
        for cat, hints in CATEGORY_HINTS.items()
        if any(_norm(hint) in haystack for hint in hints)
    ]
    if not hits:
        return None
    return next((cat for cat in CATEGORY_CANONICAL_ORDER if cat in hits), hits[0])

# ================== Imagens (100%) ==================
SRCSET_RE = re.compile(r"\s*(\S+)\s+(\d+)w\s*")

def _parse_srcset(srcset: str) -> List[Tuple[str, int]]:
    """
    Retorna [(url, width_int), ...] a partir de um srcset.
    """
    out = []
    if not srcset:
        return out
    for part in srcset.split(","):
        part = part.strip()
        m = SRCSET_RE.match(part)
        if m:
            url, w = m.group(1), int(m.group(2))
            out.append((url, w))
        else:
            # pode vir sem 'w' (raro). Tenta pegar só a URL.
            url = part.split()[0]
            if url:
                out.append((url, 0))
    return out

def _canonicalize_url(u: str) -> str:
    """
    Normaliza a URL para deduplicar (remove fragmento, mantém query porque define tamanho).
    """
    pr = urlparse(u)
    return urlunparse((pr.scheme or "https", pr.netloc, pr.path, "", pr.query, ""))

def extract_all_product_image_urls_farmax(soup: BeautifulSoup) -> List[str]:
    """Collect every product image URL on the page, widest first.

    Sources are ``<picture><source srcset>``, ``<img src>`` and
    ``<img srcset>``. A URL is kept when it contains one of the asset markers
    (``vtexassets``, ``/arquivos/``, ``/files/``, ``/assets/``) and none of
    the blacklisted fragments (logos, icons, sprites, favicons).

    URLs are de-duplicated by their canonical form and remember the largest
    width declared for them. The width is now stored and looked up under the
    same canonical key; the lookup used the raw URL before, so a later
    ``src`` occurrence could reset a known width to 0.

    Returns:
        Canonical URLs sorted by width, then path length, both descending;
        ties keep document order, so the result is stable across runs.
    """
    blacklist = ("logo", "icone", "icon", "sprite", "favicon", "/icons/")
    asset_markers = ("vtexassets", "/arquivos/", "/files/", "/assets/")

    def _wanted(u: str) -> bool:
        # Asset markers are matched case-sensitively, blacklist terms are not.
        lowered = u.lower()
        return any(k in u for k in asset_markers) and not any(b in lowered for b in blacklist)

    candidates: Dict[str, int] = {}  # canonical url -> widest declared width

    def _register(u: Optional[str], width: int) -> None:
        if not u or not _wanted(u):
            return
        key = _canonicalize_url(u)
        candidates[key] = max(candidates.get(key, 0), width)

    # 1) picture/source[srcset]
    for source in soup.select("picture source[srcset]"):
        for url, w in _parse_srcset(source.get("srcset", "")):
            _register(url, w)

    # 2) img elements: plain src (no width information) plus their srcset
    for img in soup.select("img.vtex-store-components-3-x-productImageTag, img"):
        _register(img.get("src"), 0)
        for url, w in _parse_srcset(img.get("srcset", "")):
            _register(url, w)

    def score(u: str) -> Tuple[int, int]:
        return (candidates[u], len(urlparse(u).path))

    # Dict order is insertion order and sorted() is stable, even with reverse.
    return sorted(candidates, key=score, reverse=True)

def _ext_from_url(u: str) -> str:
    p = urlparse(u).path.lower()
    for ext in (".jpg", ".jpeg", ".png", ".webp"):
        if p.endswith(ext):
            return ext
    return ".jpg"

def download_images(img_urls: List[str], product_name: str, referer: str) -> List[str]:
    """Download each image URL into ``IMG_DIR`` and return the saved file names.

    Files are named ``{slug}-{position}{ext}``, where *position* is the
    1-based index of the URL in *img_urls*, so skipped URLs leave gaps in the
    numbering. A URL is skipped when it is empty, the request fails, the
    status is not 200, the content type is not an image, or the body is
    smaller than 6000 bytes (treated as a placeholder).
    """
    slug = sanitize_filename(product_name)
    written = []
    for position, raw_url in enumerate(img_urls, start=1):
        if not raw_url:
            continue

        # Make protocol-relative and root-relative URLs absolute.
        if raw_url.startswith("//"):
            target = "https:" + raw_url
        elif raw_url.startswith("/"):
            target = urljoin(BASE_URL, raw_url)
        else:
            target = raw_url

        filename = f"{slug}-{position}{_ext_from_url(target)}"
        destination = IMG_DIR / filename

        try:
            # Session defaults plus a Referer pointing at the product page.
            request_headers = SESSION.headers.copy()
            request_headers["Referer"] = referer
            response = SESSION.get(target, timeout=40, headers=request_headers)

            if response.status_code != 200:
                continue
            content_type = (response.headers.get("Content-Type") or "").lower()
            if not content_type.startswith("image"):
                continue
            payload = response.content
            if len(payload) < 6000:  # ~6 KB: skips very small placeholders
                continue
            with open(destination, "wb") as handle:
                handle.write(payload)
            written.append(filename)
        except requests.RequestException:
            continue
    return written

# ================== Parse completo ==================
def parse_farmax_product(url: str) -> Optional[Dict]:
    """Scrape one product page into a flat record.

    Returns ``None`` when the page cannot be fetched, has no product name, or
    the name is rejected by ``should_exclude``. All product images are
    downloaded; only the first saved file name is stored in the record, and
    the remaining files stay on disk as ``{slug}-2.jpg``, ``{slug}-3.jpg``, ...
    """
    soup = get_soup(url)
    if not soup:
        return None

    name, subtitle, qty = extract_name_subtitle_qty_farmax(soup)
    if not name or should_exclude(name):
        return None

    def _joined(values: List[str]) -> Optional[str]:
        # "; "-separated string, or None for an empty list.
        return "; ".join(values) if values else None

    price = extract_price_str_farmax(soup)
    skin_types = map_skin_types_from_text(extract_skin_text_farmax(soup))
    benefits = extract_benefits_farmax(soup)
    ingredients = tokenize_and_filter_ingredients(extract_ingredients_text_farmax(soup))
    category = classify_category(name, subtitle)

    image_urls = extract_all_product_image_urls_farmax(soup)
    saved_files = download_images(image_urls, name, referer=url)

    record = {
        "marca": "farmax",
        "nome": name,
        "subtitulo": subtitle or None,
        "categoria": category,
        "quantidade": qty,
        "preco": price,
        "beneficios": _joined(benefits),
        "ingredientes": _joined(ingredients),
        "tipo_pele": _joined(skin_types),
        "imagem": saved_files[0] if saved_files else None,  # first saved file
        "url": url,  # kept for auditing
    }
    return record

# ================== Main ==================
def main():
    """Crawl the configured categories, scrape every product, write JSON and CSV.

    Product URLs come from the category pages plus ``PDP_URLS`` and are
    de-duplicated and sorted before processing. The sorted set already holds
    unique URLs, so the former per-iteration "seen" check could never fire
    and has been removed. A failure on one product is reported and does not
    stop the run.
    """
    product_urls: List[str] = []
    for cat in CATEGORY_URLS:
        product_urls.extend(collect_product_links_from_category(cat))
    product_urls.extend(PDP_URLS)

    # De-duplicate by URL; sorting keeps the processing order reproducible.
    product_urls = sorted(set(product_urls))
    total = len(product_urls)
    print(f"[Farmax] URLs a processar: {total}")

    results = []
    for i, url in enumerate(product_urls, 1):
        try:
            item = parse_farmax_product(url)
        except Exception as e:  # top-level boundary: report and keep going
            status = f"erro:{e.__class__.__name__}"
        else:
            status = "ok" if item else "descartado"
            if item:
                results.append(item)
        print(f"[{i}/{total}] {status} - {url}")
        time.sleep(0.15)  # small pause between product pages

    with open(JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    pd.DataFrame(results).to_csv(CSV_PATH, index=False, encoding="utf-8-sig")

    print(f"Salvo JSON: {JSON_PATH}")
    print(f"Salvo CSV : {CSV_PATH}")
    print(f"Itens válidos: {len(results)}")

if __name__ == "__main__":
    main()


[Farmax] URLs a processar: 31
[1/31] ok - https://www.farmax.com.br/agua-micelar-hialuronico-hidraderm-200ml/p
[2/31] ok - https://www.farmax.com.br/agua-micelar-matte-hidraderm-200ml/p
[3/31] ok - https://www.farmax.com.br/agua-micelar-vitamina-c-hidraderm-100ml/p
[4/31] ok - https://www.farmax.com.br/agua-termal-hidraderm-ciclos-100ml/p
[5/31] ok - https://www.farmax.com.br/contorno-de-olhos-hidraderm-ciclos-15g/p
[6/31] ok - https://www.farmax.com.br/creme-hidratante-antissinais-hidraderm-ciclos-60g/p
[7/31] ok - https://www.farmax.com.br/esfoliante-labial-hidraderm-ciclos-10g/p
[8/31] ok - https://www.farmax.com.br/gel-de-limpeza-facial-vitamina-c-hidraderm-120g/p
[9/31] ok - https://www.farmax.com.br/gel-de-limpeza-facial-vitamina-c-hidraderm-60g/p
[10/31] ok - https://www.farmax.com.br/gel-hidratante-facial-antioleosidade-hidraderm-100g/p
[11/31] ok - https://www.farmax.com.br/mascara-facial-argila-hidraderm-ciclos-60g/p
[12/31] ok - https://www.farmax.com.br/mascara-facial-detox